From 7e1e7cc86a6cb8d7fded1d6163eba1e5c1ee3fcb Mon Sep 17 00:00:00 2001 From: Proyag Date: Mon, 7 Oct 2019 14:04:23 +0200 Subject: [PATCH 1/5] remove unused function arguments --- docker/app.py | 1 - source/embed.py | 2 +- source/lib/text_processing.py | 8 +++----- source/paraphrase.py | 2 +- source/similarity_search.py | 1 - tasks/mldoc/mldoc.py | 2 +- tasks/xnli/xnli.py | 4 ++-- 7 files changed, 8 insertions(+), 12 deletions(-) diff --git a/docker/app.py b/docker/app.py index a5574b9a..380e02de 100644 --- a/docker/app.py +++ b/docker/app.py @@ -51,7 +51,6 @@ def vectorize(): str(tok_fname), lang=lang, romanize=True if lang == 'el' else False, - lower_case=True, gzip=False, verbose=True, over_write=False) diff --git a/source/embed.py b/source/embed.py index c775ffce..09533529 100644 --- a/source/embed.py +++ b/source/embed.py @@ -370,7 +370,7 @@ def EmbedMmap(fname, dim=1024, dtype=np.float32, verbose=False): tok_fname, lang=args.token_lang, romanize=True if args.token_lang == 'el' else False, - lower_case=True, gzip=False, + gzip=False, verbose=args.verbose, over_write=False) ifname = tok_fname diff --git a/source/lib/text_processing.py b/source/lib/text_processing.py index f262de5f..9e2a563a 100644 --- a/source/lib/text_processing.py +++ b/source/lib/text_processing.py @@ -46,8 +46,7 @@ # ############################################################################### -def TokenLine(line, lang='en', lower_case=True, romanize=False): - assert lower_case, 'lower case is needed by all the models' +def TokenLine(line, lang='en', romanize=False): roman = lang if romanize else 'none' tok = check_output( REM_NON_PRINT_CHAR @@ -70,9 +69,8 @@ def TokenLine(line, lang='en', lower_case=True, romanize=False): ############################################################################### def Token(inp_fname, out_fname, lang='en', - lower_case=True, romanize=False, descape=False, + romanize=False, descape=False, verbose=False, over_write=False, gzip=False): - assert lower_case, 'lower case is needed by all the models' assert not over_write, 'over-write is not yet implemented' if not os.path.isfile(out_fname): cat = 'zcat ' if gzip else 'cat ' @@ -110,7 +108,7 @@ def Token(inp_fname, out_fname, lang='en', # ############################################################################### -def BPEfastLoad(line, bpe_codes): +def BPEfastLoad(bpe_codes): bpe_vocab = bpe_codes.replace('fcodes', 'fvocab') return fastBPE.fastBPE(bpe_codes, bpe_vocab) diff --git a/source/paraphrase.py b/source/paraphrase.py index 5877805e..bbb8c1ca 100644 --- a/source/paraphrase.py +++ b/source/paraphrase.py @@ -247,7 +247,7 @@ def buffered_read(fp, buffer_size): ifile, lang=args.token_lang, romanize=True if args.token_lang == 'el' else False, - lower_case=True, gzip=False, + gzip=False, verbose=args.verbose, over_write=False) if args.bpe_codes: diff --git a/source/similarity_search.py b/source/similarity_search.py index 5024e91f..2b76c8aa 100644 --- a/source/similarity_search.py +++ b/source/similarity_search.py @@ -92,7 +92,6 @@ os.path.join(args.base_dir, args.output + '.tok.' + l), lang=l, romanize=True if l == 'el' else False, - lower_case=True, verbose=args.verbose, over_write=False) BPEfastApply(os.path.join(args.base_dir, args.output + '.tok.' + l), os.path.join(args.base_dir, args.output + '.bpe.' + l), diff --git a/tasks/mldoc/mldoc.py b/tasks/mldoc/mldoc.py index a6829816..3c0980ff 100644 --- a/tasks/mldoc/mldoc.py +++ b/tasks/mldoc/mldoc.py @@ -82,7 +82,7 @@ cfname + '.tok.' + lang, lang=lang, romanize=(True if lang == 'el' else False), - lower_case=True, gzip=False, + gzip=False, verbose=args.verbose, over_write=False) SplitLines(cfname + '.tok.' + lang, cfname + '.split.' + lang, diff --git a/tasks/xnli/xnli.py b/tasks/xnli/xnli.py index 918a0b73..3b965561 100644 --- a/tasks/xnli/xnli.py +++ b/tasks/xnli/xnli.py @@ -78,7 +78,7 @@ cfname + 'tok.' + lang, lang=lang, romanize=True if lang=='el' else False, - lower_case=True, gzip=True, + gzip=True, verbose=args.verbose, over_write=False) BPEfastApply(cfname + 'tok.' + lang, cfname + 'bpe.' + lang, @@ -99,7 +99,7 @@ cfname + 'tok.' + lang, lang=lang, romanize=True if lang=='el' else False, - lower_case=True, gzip=False, + gzip=False, verbose=args.verbose, over_write=False) BPEfastApply(cfname + 'tok.' + lang, cfname + 'bpe.' + lang, From 84369600e1a69fb45b8f34ddebb01452fcf9fc85 Mon Sep 17 00:00:00 2001 From: Proyag Date: Mon, 7 Oct 2019 14:09:54 +0200 Subject: [PATCH 2/5] remove unused imports --- source/mine_bitexts.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/source/mine_bitexts.py b/source/mine_bitexts.py index 18137bf5..3a58c37f 100644 --- a/source/mine_bitexts.py +++ b/source/mine_bitexts.py @@ -28,8 +28,7 @@ sys.path.append(LASER + '/source') sys.path.append(LASER + '/source/tools') -from embed import SentenceEncoder, EncodeLoad, EncodeFile, EmbedLoad -from text_processing import Token, BPEfastApply +from embed import EmbedLoad ############################################################################### From 7ed45f9215e349e7107fbb42a2a67d81c3d643b4 Mon Sep 17 00:00:00 2001 From: Proyag Date: Mon, 7 Oct 2019 14:50:41 +0200 Subject: [PATCH 3/5] remove unused pre-processing script path --- source/lib/text_processing.py | 1 - 1 file changed, 1 deletion(-) diff --git a/source/lib/text_processing.py b/source/lib/text_processing.py index 9e2a563a..505d71fd 100644 --- a/source/lib/text_processing.py +++ b/source/lib/text_processing.py @@ -28,7 +28,6 @@ FASTBPE = LASER + '/tools-external/fastBPE/fast' MOSES_BDIR = LASER + '/tools-external/moses-tokenizer/tokenizer/' MOSES_TOKENIZER = MOSES_BDIR + 'tokenizer.perl -q -no-escape -threads 20 -l ' -MOSES_LC = MOSES_BDIR + 'lowercase.perl' NORM_PUNC = MOSES_BDIR + 'normalize-punctuation.perl -l ' DESCAPE = MOSES_BDIR + 'deescape-special-chars.perl' REM_NON_PRINT_CHAR = MOSES_BDIR + 'remove-non-printing-char.perl' From 017a34e53709fe43f30481265c31e71839fc4ae2 Mon Sep 17 00:00:00 2001 From: Proyag Date: Tue, 22 Oct 2019 11:30:15 +0200 Subject: [PATCH 4/5] Revert "remove unused function arguments" This reverts commit 7e1e7cc86a6cb8d7fded1d6163eba1e5c1ee3fcb. --- docker/app.py | 1 + source/embed.py | 2 +- source/lib/text_processing.py | 8 +++++--- source/paraphrase.py | 2 +- source/similarity_search.py | 1 + tasks/mldoc/mldoc.py | 2 +- tasks/xnli/xnli.py | 4 ++-- 7 files changed, 12 insertions(+), 8 deletions(-) diff --git a/docker/app.py b/docker/app.py index 380e02de..a5574b9a 100644 --- a/docker/app.py +++ b/docker/app.py @@ -51,6 +51,7 @@ def vectorize(): str(tok_fname), lang=lang, romanize=True if lang == 'el' else False, + lower_case=True, gzip=False, verbose=True, over_write=False) diff --git a/source/embed.py b/source/embed.py index 09533529..c775ffce 100644 --- a/source/embed.py +++ b/source/embed.py @@ -370,7 +370,7 @@ def EmbedMmap(fname, dim=1024, dtype=np.float32, verbose=False): tok_fname, lang=args.token_lang, romanize=True if args.token_lang == 'el' else False, - gzip=False, + lower_case=True, gzip=False, verbose=args.verbose, over_write=False) ifname = tok_fname diff --git a/source/lib/text_processing.py b/source/lib/text_processing.py index 505d71fd..204cd848 100644 --- a/source/lib/text_processing.py +++ b/source/lib/text_processing.py @@ -45,7 +45,8 @@ # ############################################################################### -def TokenLine(line, lang='en', romanize=False): +def TokenLine(line, lang='en', lower_case=True, romanize=False): + assert lower_case, 'lower case is needed by all the models' roman = lang if romanize else 'none' tok = check_output( REM_NON_PRINT_CHAR @@ -68,8 +69,9 @@ def TokenLine(line, lang='en', romanize=False): ############################################################################### def Token(inp_fname, out_fname, lang='en', - romanize=False, descape=False, + lower_case=True, romanize=False, descape=False, verbose=False, over_write=False, gzip=False): + assert lower_case, 'lower case is needed by all the models' assert not over_write, 'over-write is not yet implemented' if not os.path.isfile(out_fname): cat = 'zcat ' if gzip else 'cat ' @@ -107,7 +109,7 @@ def Token(inp_fname, out_fname, lang='en', # ############################################################################### -def BPEfastLoad(bpe_codes): +def BPEfastLoad(line, bpe_codes): bpe_vocab = bpe_codes.replace('fcodes', 'fvocab') return fastBPE.fastBPE(bpe_codes, bpe_vocab) diff --git a/source/paraphrase.py b/source/paraphrase.py index bbb8c1ca..5877805e 100644 --- a/source/paraphrase.py +++ b/source/paraphrase.py @@ -247,7 +247,7 @@ def buffered_read(fp, buffer_size): ifile, lang=args.token_lang, romanize=True if args.token_lang == 'el' else False, - gzip=False, + lower_case=True, gzip=False, verbose=args.verbose, over_write=False) if args.bpe_codes: diff --git a/source/similarity_search.py b/source/similarity_search.py index 2b76c8aa..5024e91f 100644 --- a/source/similarity_search.py +++ b/source/similarity_search.py @@ -92,6 +92,7 @@ os.path.join(args.base_dir, args.output + '.tok.' + l), lang=l, romanize=True if l == 'el' else False, + lower_case=True, verbose=args.verbose, over_write=False) BPEfastApply(os.path.join(args.base_dir, args.output + '.tok.' + l), os.path.join(args.base_dir, args.output + '.bpe.' + l), diff --git a/tasks/mldoc/mldoc.py b/tasks/mldoc/mldoc.py index 3c0980ff..a6829816 100644 --- a/tasks/mldoc/mldoc.py +++ b/tasks/mldoc/mldoc.py @@ -82,7 +82,7 @@ cfname + '.tok.' + lang, lang=lang, romanize=(True if lang == 'el' else False), - gzip=False, + lower_case=True, gzip=False, verbose=args.verbose, over_write=False) SplitLines(cfname + '.tok.' + lang, cfname + '.split.' + lang, diff --git a/tasks/xnli/xnli.py b/tasks/xnli/xnli.py index 3b965561..918a0b73 100644 --- a/tasks/xnli/xnli.py +++ b/tasks/xnli/xnli.py @@ -78,7 +78,7 @@ cfname + 'tok.' + lang, lang=lang, romanize=True if lang=='el' else False, - gzip=True, + lower_case=True, gzip=True, verbose=args.verbose, over_write=False) BPEfastApply(cfname + 'tok.' + lang, cfname + 'bpe.' + lang, @@ -99,7 +99,7 @@ cfname + 'tok.' + lang, lang=lang, romanize=True if lang=='el' else False, - gzip=False, + lower_case=True, gzip=False, verbose=args.verbose, over_write=False) BPEfastApply(cfname + 'tok.' + lang, cfname + 'bpe.' + lang, From 011d0925a31e4e4d6866e4711d41abf84354bc40 Mon Sep 17 00:00:00 2001 From: Proyag Date: Tue, 22 Oct 2019 11:32:04 +0200 Subject: [PATCH 5/5] remove unused argument in BPEfastLoad --- source/lib/text_processing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/lib/text_processing.py b/source/lib/text_processing.py index 204cd848..a079bb63 100644 --- a/source/lib/text_processing.py +++ b/source/lib/text_processing.py @@ -109,7 +109,7 @@ def Token(inp_fname, out_fname, lang='en', # ############################################################################### -def BPEfastLoad(line, bpe_codes): +def BPEfastLoad(bpe_codes): bpe_vocab = bpe_codes.replace('fcodes', 'fvocab') return fastBPE.fastBPE(bpe_codes, bpe_vocab)