Update how similar-message matching is handled when updating po files from the pot file (gains something like 20% in heavy update situations, and saves a nice bunch of memory!).
This commit is contained in:
@@ -387,7 +387,7 @@ def dump_py_messages_from_files(messages, check_ctxt, files):
|
|||||||
estr_ls.append(estr)
|
estr_ls.append(estr)
|
||||||
nds_ls.extend(nds)
|
nds_ls.extend(nds)
|
||||||
ret = _extract_string_merge(estr_ls, nds_ls)
|
ret = _extract_string_merge(estr_ls, nds_ls)
|
||||||
print(ret)
|
#print(ret)
|
||||||
return ret
|
return ret
|
||||||
|
|
||||||
def extract_strings_split(node):
|
def extract_strings_split(node):
|
||||||
|
|||||||
@@ -225,6 +225,7 @@ dict_uimsgs = {
|
|||||||
"loc", "rot", "pos",
|
"loc", "rot", "pos",
|
||||||
"lorem",
|
"lorem",
|
||||||
"luma",
|
"luma",
|
||||||
|
"mem",
|
||||||
"multicam",
|
"multicam",
|
||||||
"num",
|
"num",
|
||||||
"ok",
|
"ok",
|
||||||
|
|||||||
@@ -97,10 +97,13 @@ def main():
|
|||||||
if os.path.exists(po):
|
if os.path.exists(po):
|
||||||
pool_data.append((po, lang, pot_msgs))
|
pool_data.append((po, lang, pot_msgs))
|
||||||
|
|
||||||
with concurrent.futures.ProcessPoolExecutor() as executor:
|
for r in map(process_po, pool_data):
|
||||||
for r in executor.map(process_po, pool_data, timeout=600):
|
if r != 0:
|
||||||
if r != 0:
|
ret = r
|
||||||
ret = r
|
#with concurrent.futures.ProcessPoolExecutor() as executor:
|
||||||
|
#for r in executor.map(process_po, pool_data, timeout=600):
|
||||||
|
#if r != 0:
|
||||||
|
#ret = r
|
||||||
|
|
||||||
return ret
|
return ret
|
||||||
|
|
||||||
|
|||||||
@@ -21,6 +21,7 @@
|
|||||||
# Some misc utilities...
|
# Some misc utilities...
|
||||||
|
|
||||||
import collections
|
import collections
|
||||||
|
import concurrent.futures
|
||||||
import copy
|
import copy
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
@@ -61,6 +62,35 @@ def is_valid_po_path(path):
|
|||||||
return bool(_valid_po_path_re.match(path))
|
return bool(_valid_po_path_re.match(path))
|
||||||
|
|
||||||
|
|
||||||
|
def get_best_similar(data):
|
||||||
|
import difflib
|
||||||
|
key, use_similar, similar_pool = data
|
||||||
|
|
||||||
|
# try to find some close key in existing messages...
|
||||||
|
# Optimized code inspired by difflib.get_close_matches (as we only need the best match).
|
||||||
|
# We also consider to never make a match when len differs more than -len_key / 2, +len_key * 2 (which is valid
|
||||||
|
# as long as use_similar is not below ~0.7).
|
||||||
|
# Gives an overall ~20% of improvement!
|
||||||
|
#tmp = difflib.get_close_matches(key[1], similar_pool, n=1, cutoff=use_similar)
|
||||||
|
#if tmp:
|
||||||
|
#tmp = tmp[0]
|
||||||
|
tmp = None
|
||||||
|
s = difflib.SequenceMatcher()
|
||||||
|
s.set_seq2(key[1])
|
||||||
|
len_key = len(key[1])
|
||||||
|
min_len = len_key // 2
|
||||||
|
max_len = len_key * 2
|
||||||
|
for x in similar_pool:
|
||||||
|
if min_len < len(x) < max_len:
|
||||||
|
s.set_seq1(x)
|
||||||
|
if s.real_quick_ratio() >= use_similar and s.quick_ratio() >= use_similar:
|
||||||
|
sratio = s.ratio()
|
||||||
|
if sratio >= use_similar:
|
||||||
|
tmp = x
|
||||||
|
use_similar = sratio
|
||||||
|
return key, tmp
|
||||||
|
|
||||||
|
|
||||||
class I18nMessage:
|
class I18nMessage:
|
||||||
"""
|
"""
|
||||||
Internal representation of a message.
|
Internal representation of a message.
|
||||||
@@ -233,40 +263,73 @@ class I18nMessages:
|
|||||||
existing one. Messages no more found in ref will be marked as commented if keep_old_commented is True,
|
existing one. Messages no more found in ref will be marked as commented if keep_old_commented is True,
|
||||||
or removed.
|
or removed.
|
||||||
"""
|
"""
|
||||||
import difflib
|
|
||||||
similar_pool = {}
|
similar_pool = {}
|
||||||
if use_similar > 0.0:
|
if use_similar > 0.0:
|
||||||
for key, msg in self.msgs.items():
|
for key, msg in self.msgs.items():
|
||||||
if msg.msgstr: # No need to waste time with void translations!
|
if msg.msgstr: # No need to waste time with void translations!
|
||||||
similar_pool.setdefault(key[1], set()).add(key)
|
similar_pool.setdefault(key[1], set()).add(key)
|
||||||
|
|
||||||
msgs = self._new_messages()
|
msgs = self._new_messages().fromkeys(ref.msgs.keys())
|
||||||
for (key, msg) in ref.msgs.items():
|
ref_keys = set(ref.msgs.keys())
|
||||||
if key in self.msgs:
|
org_keys = set(self.msgs.keys())
|
||||||
msgs[key] = self.msgs[key]
|
new_keys = ref_keys - org_keys
|
||||||
msgs[key].sources = msg.sources
|
removed_keys = org_keys - ref_keys
|
||||||
else:
|
|
||||||
skey = None
|
print(new_keys, "\n\n", removed_keys)
|
||||||
if use_similar > 0.0:
|
|
||||||
# try to find some close key in existing messages...
|
# First process keys present in both org and ref messages.
|
||||||
tmp = difflib.get_close_matches(key[1], similar_pool, n=1, cutoff=use_similar)
|
for key in ref_keys - new_keys:
|
||||||
if tmp:
|
msg, refmsg = self.msgs[key], ref.msgs[key]
|
||||||
tmp = tmp[0]
|
msg.sources = refmsg.sources
|
||||||
|
msg.is_commented = refmsg.is_commented
|
||||||
|
msg.is_fuzzy = refmsg.is_fuzzy
|
||||||
|
msgs[key] = msg
|
||||||
|
|
||||||
|
# Next process new keys.
|
||||||
|
if use_similar > 0.0:
|
||||||
|
with concurrent.futures.ProcessPoolExecutor() as exctr:
|
||||||
|
for key, msgid in exctr.map(get_best_similar,
|
||||||
|
tuple((nk, use_similar, tuple(similar_pool.keys())) for nk in new_keys)):
|
||||||
|
if msgid:
|
||||||
# Try to get the same context, else just get one...
|
# Try to get the same context, else just get one...
|
||||||
skey = (key[0], tmp)
|
skey = (key[0], msgid)
|
||||||
if skey not in similar_pool[tmp]:
|
if skey not in similar_pool[msgid]:
|
||||||
skey = tuple(similar_pool[tmp])[0]
|
skey = tuple(similar_pool[msgid])[0]
|
||||||
msgs[key] = msg
|
# We keep org translation and comments, and mark message as fuzzy.
|
||||||
if skey:
|
msg, refmsg = copy.deepcopy(self.msgs[skey]), ref.msgs[key]
|
||||||
msgs[key].msgstr = self.msgs[skey].msgstr
|
msg.msgctxt = refmsg.msgctxt
|
||||||
msgs[key].is_fuzzy = True
|
msg.msgid = refmsg.msgid
|
||||||
|
msg.sources = refmsg.sources
|
||||||
|
msg.is_fuzzy = True
|
||||||
|
msg.is_commented = refmsg.is_commented
|
||||||
|
msgs[key] = msg
|
||||||
|
else:
|
||||||
|
msgs[key] = ref.msgs[key]
|
||||||
|
else:
|
||||||
|
for key in new_keys:
|
||||||
|
msgs[key] = ref.msgs[key]
|
||||||
|
|
||||||
# Add back all "old" and already commented messages as commented ones, if required
|
# Add back all "old" and already commented messages as commented ones, if required
|
||||||
# (and translation was not void!).
|
# (and translation was not void!).
|
||||||
if keep_old_commented:
|
if keep_old_commented:
|
||||||
for key, msg in self.msgs.items():
|
for key in removed_keys:
|
||||||
if key not in msgs and msg.msgstr:
|
msgs[key] = self.msgs[key]
|
||||||
msgs[key] = msg
|
msgs[key].is_commented = True
|
||||||
msgs[key].is_commented = True
|
msgs[key].sources = []
|
||||||
|
|
||||||
|
# Special 'meta' message, change project ID version and pot creation date...
|
||||||
|
key = ("", "")
|
||||||
|
rep = []
|
||||||
|
markers = ("Project-Id-Version:", "POT-Creation-Date:")
|
||||||
|
for mrk in markers:
|
||||||
|
for rl in ref.msgs[key].msgstr_lines:
|
||||||
|
if rl.startswith(mrk):
|
||||||
|
for idx, ml in enumerate(msgs[key].msgstr_lines):
|
||||||
|
if ml.startswith(mrk):
|
||||||
|
rep.append((idx, rl))
|
||||||
|
for idx, txt in rep:
|
||||||
|
msgs[key].msgstr_lines[idx] = txt
|
||||||
|
|
||||||
# And finalize the update!
|
# And finalize the update!
|
||||||
self.msgs = msgs
|
self.msgs = msgs
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user