-
Notifications
You must be signed in to change notification settings - Fork 79
Open
Description
Tower-Plus-9B (and, I think, other LLMs as well) emits special tags in its output, for example: "Borchmann und Lukasz Garncarek durchgef\u00fchrt wurde.<end_of_turn>"
The <end_of_turn> tag should not appear in the output, and the fix should be systematic so that it works for any LLM rather than only for this one tag.
Command:
simulstreaming_translate.py --input-jsonl ../../inputs/en//acl6060.ts/gold-jsonl/2022.acl-long.590.jsonl --tgt-lan de --src-lan en --model-dir ct2_Tower-Plus-9B --tokenizer-dir Tower-Plus-9B --max-context-length 200 --min-chunk-size 5 --comp_unaware
Part of output:
{"emission_time": 1.422, "end": 1.422, "status": "COMPLETE", "text": "", "unconfirmed_text": "", "is_final": false}
{"emission_time": 1.742, "end": 1.742, "status": "COMPLETE", "text": "", "unconfirmed_text": "", "is_final": false}
{"emission_time": 2.002, "end": 2.002, "status": "COMPLETE", "text": "", "unconfirmed_text": "", "is_final": false}
{"emission_time": 2.132, "end": 2.132, "status": "COMPLETE", "text": "", "unconfirmed_text": "", "is_final": false}
{"emission_time": 2.392, "end": 2.392, "status": "INCOMPLETE", "text": "", "unconfirmed_text": "Hallo, mein Name ist Micha\u0142<end_of_turn>", "is_final": false}
{"emission_time": 2.392, "end": 2.392, "status": "COMPLETE", "text": "", "unconfirmed_text": "Hallo, mein Name ist Micha\u0142<end_of_turn>", "is_final": false}
{"emission_time": 3.282, "end": 3.282, "status": "COMPLETE", "text": "", "unconfirmed_text": "Hallo, mein Name ist Micha\u0142<end_of_turn>", "is_final": false}
{"emission_time": 3.442, "end": 3.442, "status": "COMPLETE", "text": "", "unconfirmed_text": "Hallo, mein Name ist Micha\u0142<end_of_turn>", "is_final": false}
{"emission_time": 3.5820000000000003, "end": 3.5820000000000003, "status": "COMPLETE", "text": "", "unconfirmed_text": "Hallo, mein Name ist Micha\u0142<end_of_turn>", "is_final": false}
{"emission_time": 3.722, "end": 3.722, "status": "COMPLETE", "text": "", "unconfirmed_text": "Hallo, mein Name ist Micha\u0142<end_of_turn>", "is_final": false}
{"emission_time": 3.852, "end": 3.852, "status": "INCOMPLETE", "text": " Hallo, mein Name ist", "unconfirmed_text": "Micha\u0142 Pietruszka und ich bin...<end_of_turn>", "is_final": false}
{"emission_time": 3.852, "end": 3.852, "status": "COMPLETE", "text": " Hallo, mein Name ist", "unconfirmed_text": "Micha\u0142 Pietruszka und ich bin...<end_of_turn>", "is_final": false}
{"emission_time": 4.322, "end": 4.322, "status": "COMPLETE", "text": "", "unconfirmed_text": "Micha\u0142 Pietruszka und ich bin...<end_of_turn>", "is_final": false}
{"emission_time": 4.4719999999999995, "end": 4.4719999999999995, "status": "COMPLETE", "text": "", "unconfirmed_text": "Micha\u0142 Pietruszka und ich bin...<end_of_turn>", "is_final": false}
{"emission_time": 4.962, "end": 4.962, "status": "COMPLETE", "text": "", "unconfirmed_text": "Micha\u0142 Pietruszka und ich bin...<end_of_turn>", "is_final": false}
{"emission_time": 5.052, "end": 5.052, "status": "COMPLETE", "text": "", "unconfirmed_text": "Micha\u0142 Pietruszka und ich bin...<end_of_turn>", "is_final": false}
{"emission_time": 5.162, "end": 5.162, "status": "INCOMPLETE", "text": " Micha\u0142 Pietruszka und", "unconfirmed_text": "es ist mir eine Freude, Ihnen...<end_of_turn>", "is_final": false}
{"emission_time": 5.162, "end": 5.162, "status": "COMPLETE", "text": " Micha\u0142 Pietruszka und", "unconfirmed_text": "es ist mir eine Freude, Ihnen...<end_of_turn>", "is_final": false}
{"emission_time": 5.302, "end": 5.302, "status": "COMPLETE", "text": "", "unconfirmed_text": "es ist mir eine Freude, Ihnen...<end_of_turn>", "is_final": false}
{"emission_time": 5.802, "end": 5.802, "status": "COMPLETE", "text": "", "unconfirmed_text": "es ist mir eine Freude, Ihnen...<end_of_turn>", "is_final": false}
{"emission_time": 6.292, "end": 6.292, "status": "COMPLETE", "text": "", "unconfirmed_text": "es ist mir eine Freude, Ihnen...<end_of_turn>", "is_final": false}
{"emission_time": 7.412, "end": 7.412, "status": "COMPLETE", "text": "", "unconfirmed_text": "es ist mir eine Freude, Ihnen...<end_of_turn>", "is_final": false}
{"emission_time": 8.042, "end": 8.042, "status": "INCOMPLETE", "text": " es ist mir eine Freude,", "unconfirmed_text": "Ihnen den Vortrag mit dem Titel \u201eSparsifying Transformer\u201c pr\u00e4sentieren zu d\u00fcrfen.<end_of_turn>", "is_final": false}
{"emission_time": 8.042, "end": 8.042, "status": "COMPLETE", "text": " es ist mir eine Freude,", "unconfirmed_text": "Ihnen den Vortrag mit dem Titel \u201eSparsifying Transformer\u201c pr\u00e4sentieren zu d\u00fcrfen.<end_of_turn>", "is_final": false}
{"emission_time": 8.542, "end": 8.542, "status": "COMPLETE", "text": "", "unconfirmed_text": "Ihnen den Vortrag mit dem Titel \u201eSparsifying Transformer\u201c pr\u00e4sentieren zu d\u00fcrfen.<end_of_turn>", "is_final": false}
{"emission_time": 8.732, "end": 8.732, "status": "COMPLETE", "text": "", "unconfirmed_text": "Ihnen den Vortrag mit dem Titel \u201eSparsifying Transformer\u201c pr\u00e4sentieren zu d\u00fcrfen.<end_of_turn>", "is_final": false}
{"emission_time": 9.272, "end": 9.272, "status": "COMPLETE", "text": "", "unconfirmed_text": "Ihnen den Vortrag mit dem Titel \u201eSparsifying Transformer\u201c pr\u00e4sentieren zu d\u00fcrfen.<end_of_turn>", "is_final": false}
{"emission_time": 10.052000000000001, "end": 10.052000000000001, "status": "COMPLETE", "text": "", "unconfirmed_text": "Ihnen den Vortrag mit dem Titel \u201eSparsifying Transformer\u201c pr\u00e4sentieren zu d\u00fcrfen.<end_of_turn>", "is_final": false}
{"emission_time": 10.572000000000001, "end": 10.572000000000001, "status": "INCOMPLETE", "text": " Ihnen den Vortrag mit dem Titel \u201eSparsifying", "unconfirmed_text": "Transformer Models with Trainable Representation Pooling\u201c zu pr\u00e4sentieren.<end_of_turn>", "is_final": false}
{"emission_time": 10.572000000000001, "end": 10.572000000000001, "status": "COMPLETE", "text": " Ihnen den Vortrag mit dem Titel \u201eSparsifying", "unconfirmed_text": "Transformer Models with Trainable Representation Pooling\u201c zu pr\u00e4sentieren.<end_of_turn>", "is_final": false}
{"emission_time": 10.572000000000001, "end": 10.572000000000001, "status": "COMPLETE", "text": "Transformer Models with Trainable Representation Pooling\u201c zu pr\u00e4sentieren.<end_of_turn>", "unconfirmed_text": "", "is_final": true}
{"emission_time": 11.188, "end": 11.188, "status": "COMPLETE", "text": "", "unconfirmed_text": "", "is_final": false}
{"emission_time": 11.488, "end": 11.488, "status": "COMPLETE", "text": "", "unconfirmed_text": "", "is_final": false}
{"emission_time": 11.718, "end": 11.718, "status": "COMPLETE", "text": "", "unconfirmed_text": "", "is_final": false}
{"emission_time": 12.138, "end": 12.138, "status": "COMPLETE", "text": "", "unconfirmed_text": "", "is_final": false}
{"emission_time": 12.338000000000001, "end": 12.338000000000001, "status": "INCOMPLETE", "text": "", "unconfirmed_text": "Eine Arbeit, die bei Applica durchgef\u00fchrt wurde<end_of_turn>", "is_final": false}
{"emission_time": 12.338000000000001, "end": 12.338000000000001, "status": "COMPLETE", "text": "", "unconfirmed_text": "Eine Arbeit, die bei Applica durchgef\u00fchrt wurde<end_of_turn>", "is_final": false}
{"emission_time": 12.778, "end": 12.778, "status": "COMPLETE", "text": "", "unconfirmed_text": "Eine Arbeit, die bei Applica durchgef\u00fchrt wurde<end_of_turn>", "is_final": false}
{"emission_time": 13.058, "end": 13.058, "status": "COMPLETE", "text": "", "unconfirmed_text": "Eine Arbeit, die bei Applica durchgef\u00fchrt wurde<end_of_turn>", "is_final": false}
{"emission_time": 13.838000000000001, "end": 13.838000000000001, "status": "COMPLETE", "text": "", "unconfirmed_text": "Eine Arbeit, die bei Applica durchgef\u00fchrt wurde<end_of_turn>", "is_final": false}
{"emission_time": 13.968, "end": 13.968, "status": "COMPLETE", "text": "", "unconfirmed_text": "Eine Arbeit, die bei Applica durchgef\u00fchrt wurde<end_of_turn>", "is_final": false}
{"emission_time": 14.488, "end": 14.488, "status": "INCOMPLETE", "text": " Eine Arbeit, die bei Applica", "unconfirmed_text": "AI in Zusammenarbeit mit Lukasz durchgef\u00fchrt wurde.<end_of_turn>", "is_final": false}
{"emission_time": 14.488, "end": 14.488, "status": "COMPLETE", "text": " Eine Arbeit, die bei Applica", "unconfirmed_text": "AI in Zusammenarbeit mit Lukasz durchgef\u00fchrt wurde.<end_of_turn>", "is_final": false}
{"emission_time": 15.238, "end": 15.238, "status": "COMPLETE", "text": "", "unconfirmed_text": "AI in Zusammenarbeit mit Lukasz durchgef\u00fchrt wurde.<end_of_turn>", "is_final": false}
{"emission_time": 15.408000000000001, "end": 15.408000000000001, "status": "COMPLETE", "text": "", "unconfirmed_text": "AI in Zusammenarbeit mit Lukasz durchgef\u00fchrt wurde.<end_of_turn>", "is_final": false}
{"emission_time": 15.708, "end": 15.708, "status": "COMPLETE", "text": "", "unconfirmed_text": "AI in Zusammenarbeit mit Lukasz durchgef\u00fchrt wurde.<end_of_turn>", "is_final": false}
{"emission_time": 16.498, "end": 16.498, "status": "INCOMPLETE", "text": " AI in Zusammenarbeit mit Lukasz", "unconfirmed_text": "Borchmann und Lukasz Garncarek durchgef\u00fchrt wurde.<end_of_turn>", "is_final": false}
{"emission_time": 16.498, "end": 16.498, "status": "COMPLETE", "text": " AI in Zusammenarbeit mit Lukasz", "unconfirmed_text": "Borchmann und Lukasz Garncarek durchgef\u00fchrt wurde.<end_of_turn>", "is_final": false}
{"emission_time": 16.498, "end": 16.498, "status": "COMPLETE", "text": "Borchmann und Lukasz Garncarek durchgef\u00fchrt wurde.<end_of_turn>", "unconfirmed_text": "", "is_final": true}
Part of input:
{"start": 1.012, "end": 1.422, "text": " Hello,", "emission_time": 1.422, "is_final": false}
{"start": 1.612, "end": 1.742, "text": " my", "emission_time": 1.742, "is_final": false}
{"start": 1.742, "end": 2.002, "text": " name", "emission_time": 2.002, "is_final": false}
{"start": 2.002, "end": 2.132, "text": " is", "emission_time": 2.132, "is_final": false}
{"start": 2.132, "end": 2.392, "text": " Micha\u0142", "emission_time": 2.392, "is_final": false}
{"start": 2.392, "end": 3.282, "text": " Pietruszka", "emission_time": 3.282, "is_final": false}
{"start": 3.282, "end": 3.442, "text": " and", "emission_time": 3.442, "is_final": false}
{"start": 3.442, "end": 3.5820000000000003, "text": " it", "emission_time": 3.5820000000000003, "is_final": false}
{"start": 3.5820000000000003, "end": 3.722, "text": " is", "emission_time": 3.722, "is_final": false}
{"start": 3.722, "end": 3.852, "text": " my", "emission_time": 3.852, "is_final": false}
machacek@sol2:~/work/uedin/iwslt26-sst/systems/baseline-SimulStreaming$ head ../../inputs/en//acl6060.ts/gold-jsonl/2022.acl-long.590.jsonl -n 100
{"start": 1.012, "end": 1.422, "text": " Hello,", "emission_time": 1.422, "is_final": false}
{"start": 1.612, "end": 1.742, "text": " my", "emission_time": 1.742, "is_final": false}
{"start": 1.742, "end": 2.002, "text": " name", "emission_time": 2.002, "is_final": false}
{"start": 2.002, "end": 2.132, "text": " is", "emission_time": 2.132, "is_final": false}
{"start": 2.132, "end": 2.392, "text": " Micha\u0142", "emission_time": 2.392, "is_final": false}
{"start": 2.392, "end": 3.282, "text": " Pietruszka", "emission_time": 3.282, "is_final": false}
{"start": 3.282, "end": 3.442, "text": " and", "emission_time": 3.442, "is_final": false}
{"start": 3.442, "end": 3.5820000000000003, "text": " it", "emission_time": 3.5820000000000003, "is_final": false}
{"start": 3.5820000000000003, "end": 3.722, "text": " is", "emission_time": 3.722, "is_final": false}
{"start": 3.722, "end": 3.852, "text": " my", "emission_time": 3.852, "is_final": false}
{"start": 3.852, "end": 4.322, "text": " pleasure", "emission_time": 4.322, "is_final": false}
{"start": 4.322, "end": 4.4719999999999995, "text": " to", "emission_time": 4.4719999999999995, "is_final": false}
{"start": 4.4719999999999995, "end": 4.962, "text": " present", "emission_time": 4.962, "is_final": false}
{"start": 4.962, "end": 5.052, "text": " to", "emission_time": 5.052, "is_final": false}
{"start": 5.052, "end": 5.162, "text": " you", "emission_time": 5.162, "is_final": false}
{"start": 5.162, "end": 5.302, "text": " the", "emission_time": 5.302, "is_final": false}
{"start": 5.302, "end": 5.802, "text": " paper", "emission_time": 5.802, "is_final": false}
{"start": 5.802, "end": 6.292, "text": " titled", "emission_time": 6.292, "is_final": false}
{"start": 6.762, "end": 7.412, "text": " Sparsifying", "emission_time": 7.412, "is_final": false}
{"start": 7.412, "end": 8.042, "text": " Transformer", "emission_time": 8.042, "is_final": false}
{"start": 8.042, "end": 8.542, "text": " Models", "emission_time": 8.542, "is_final": false}
{"start": 8.542, "end": 8.732, "text": " with", "emission_time": 8.732, "is_final": false}
{"start": 8.732, "end": 9.272, "text": " Trainable", "emission_time": 9.272, "is_final": false}
{"start": 9.272, "end": 10.052000000000001, "text": " Representation", "emission_time": 10.052000000000001, "is_final": false}
{"start": 10.052000000000001, "end": 10.572000000000001, "text": " Pooling.", "emission_time": 10.572000000000001, "is_final": true}
{"start": 11.048, "end": 11.188, "text": " A", "emission_time": 11.188, "is_final": false}
{"start": 11.188, "end": 11.488, "text": " work", "emission_time": 11.488, "is_final": false}
{"start": 11.488, "end": 11.718, "text": " done", "emission_time": 11.718, "is_final": false}
{"start": 11.718, "end": 12.138, "text": " at", "emission_time": 12.138, "is_final": false}
{"start": 12.138, "end": 12.338000000000001, "text": " Applica", "emission_time": 12.338000000000001, "is_final": false}
{"start": 12.338000000000001, "end": 12.778, "text": " AI", "emission_time": 12.778, "is_final": false}
{"start": 12.778, "end": 13.058, "text": " in", "emission_time": 13.058, "is_final": false}
{"start": 13.058, "end": 13.838000000000001, "text": " cooperation", "emission_time": 13.838000000000001, "is_final": false}
{"start": 13.838000000000001, "end": 13.968, "text": " with", "emission_time": 13.968, "is_final": false}
{"start": 13.968, "end": 14.488, "text": " Lukasz", "emission_time": 14.488, "is_final": false}
{"start": 14.648, "end": 15.238, "text": " Borchmann", "emission_time": 15.238, "is_final": false}
{"start": 15.238, "end": 15.408000000000001, "text": " and", "emission_time": 15.408000000000001, "is_final": false}
{"start": 15.408000000000001, "end": 15.708, "text": " Lukasz", "emission_time": 15.708, "is_final": false}
{"start": 15.968, "end": 16.498, "text": " Garncarek.", "emission_time": 16.498, "is_final": true}
{"start": 18.122, "end": 18.262, "text": " Let", "emission_time": 18.262, "is_final": false}
{"start": 18.262, "end": 18.372, "text": " me", "emission_time": 18.372, "is_final": false}
{"start": 18.372, "end": 18.852, "text": " start", "emission_time": 18.852, "is_final": false}
{"start": 18.852, "end": 19.061999999999998, "text": " with", "emission_time": 19.061999999999998, "is_final": false}
{"start": 19.061999999999998, "end": 19.142, "text": " the", "emission_time": 19.142, "is_final": false}
{"start": 19.142, "end": 19.762, "text": " problems", "emission_time": 19.762, "is_final": false}
{"start": 19.762, "end": 19.962, "text": " our", "emission_time": 19.962, "is_final": false}
{"start": 19.962, "end": 20.332, "text": " work", "emission_time": 20.332, "is_final": false}
{"start": 20.332, "end": 20.942, "text": " targets.", "emission_time": 20.942, "is_final": true}
{"start": 21.709, "end": 21.949, "text": " Our", "emission_time": 21.949, "is_final": false}
{"start": 21.949, "end": 22.389, "text": " method", "emission_time": 22.389, "is_final": false}
{"start": 22.389, "end": 22.769, "text": " works", "emission_time": 22.769, "is_final": false}
{"start": 22.769, "end": 23.059, "text": " well", "emission_time": 23.059, "is_final": false}
{"start": 23.059, "end": 23.249, "text": " for", "emission_time": 23.249, "is_final": false}
{"start": 23.249, "end": 23.349, "text": " the", "emission_time": 23.349, "is_final": false}
{"start": 23.349, "end": 23.829, "text": " cases", "emission_time": 23.829, "is_final": false}
{"start": 23.829, "end": 24.079, "text": " where", "emission_time": 24.079, "is_final": false}
{"start": 24.079, "end": 24.339, "text": " long", "emission_time": 24.339, "is_final": false}
{"start": 24.339, "end": 24.849, "text": " inputs", "emission_time": 24.849, "is_final": false}
{"start": 24.849, "end": 24.879, "text": " are", "emission_time": 24.879, "is_final": false}
{"start": 24.879, "end": 25.549, "text": " considered.", "emission_time": 25.549, "is_final": true}
{"start": 26.042, "end": 26.362, "text": " Roughly", "emission_time": 26.362, "is_final": false}
{"start": 26.362, "end": 26.962, "text": " speaking,", "emission_time": 26.962, "is_final": false}
{"start": 26.962, "end": 27.092, "text": " it", "emission_time": 27.092, "is_final": false}
{"start": 27.092, "end": 27.212, "text": " is", "emission_time": 27.212, "is_final": false}
{"start": 27.212, "end": 27.432, "text": " meant", "emission_time": 27.432, "is_final": false}
{"start": 27.432, "end": 27.592, "text": " for", "emission_time": 27.592, "is_final": false}
{"start": 27.592, "end": 27.701999999999998, "text": " the", "emission_time": 27.701999999999998, "is_final": false}
{"start": 27.701999999999998, "end": 28.072, "text": " task", "emission_time": 28.072, "is_final": false}
{"start": 28.072, "end": 28.592, "text": " orders", "emission_time": 28.592, "is_final": false}
{"start": 28.592, "end": 28.792, "text": " and", "emission_time": 28.792, "is_final": false}
{"start": 28.792, "end": 29.142, "text": " input", "emission_time": 29.142, "is_final": false}
{"start": 29.142, "end": 29.322, "text": " of", "emission_time": 29.322, "is_final": false}
{"start": 29.322, "end": 29.582, "text": " over", "emission_time": 29.582, "is_final": false}
{"start": 29.582, "end": 29.782, "text": " two", "emission_time": 29.782, "is_final": false}
{"start": 29.782, "end": 30.192, "text": " thousand", "emission_time": 30.192, "is_final": false}
{"start": 30.192, "end": 30.672, "text": " tokens", "emission_time": 30.672, "is_final": false}
{"start": 30.672, "end": 30.842, "text": " and", "emission_time": 30.842, "is_final": false}
{"start": 30.842, "end": 30.922, "text": " the", "emission_time": 30.922, "is_final": false}
{"start": 30.922, "end": 31.392, "text": " targets", "emission_time": 31.392, "is_final": false}
{"start": 31.392, "end": 31.532, "text": " are", "emission_time": 31.532, "is_final": false}
{"start": 31.532, "end": 32.202, "text": " shorter", "emission_time": 32.202, "is_final": false}
{"start": 32.542, "end": 32.692, "text": " than", "emission_time": 32.692, "is_final": false}
{"start": 32.692, "end": 32.782, "text": " the", "emission_time": 32.782, "is_final": false}
{"start": 32.782, "end": 33.252, "text": " provided", "emission_time": 33.252, "is_final": false}
{"start": 33.252, "end": 33.782, "text": " inputs.", "emission_time": 33.782, "is_final": true}
{"start": 35.655, "end": 35.845000000000006, "text": " This", "emission_time": 35.845000000000006, "is_final": false}
{"start": 35.845000000000006, "end": 35.995000000000005, "text": " has", "emission_time": 35.995000000000005, "is_final": false}
{"start": 35.995000000000005, "end": 36.23500000000001, "text": " some", "emission_time": 36.23500000000001, "is_final": false}
{"start": 36.23500000000001, "end": 36.745000000000005, "text": " specific", "emission_time": 36.745000000000005, "is_final": false}
{"start": 36.745000000000005, "end": 37.55500000000001, "text": " applications", "emission_time": 37.55500000000001, "is_final": false}
{"start": 37.55500000000001, "end": 37.695, "text": " in", "emission_time": 37.695, "is_final": false}
{"start": 37.695, "end": 38.332, "text": " NLP.", "emission_time": 38.332, "is_final": true}
{"start": 38.683, "end": 38.833, "text": " For", "emission_time": 38.833, "is_final": false}
{"start": 38.833, "end": 39.312999999999995, "text": " example,", "emission_time": 39.312999999999995, "is_final": false}
{"start": 39.312999999999995, "end": 39.492999999999995, "text": " one", "emission_time": 39.492999999999995, "is_final": false}
{"start": 39.492999999999995, "end": 39.793, "text": " can", "emission_time": 39.793, "is_final": false}
{"start": 39.793, "end": 40.003, "text": " imagine", "emission_time": 40.003, "is_final": false}
{"start": 40.092999999999996, "end": 40.893, "text": " that", "emission_time": 40.893, "is_final": false}
{"start": 40.893, "end": 41.213, "text": " given", "emission_time": 41.213, "is_final": false}
{"start": 41.213, "end": 41.253, "text": " a", "emission_time": 41.253, "is_final": false}
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels