Add machine translation

yinyou · Dec 23, 2016 · 478db44 · 478db44
1 parent 0a0e290
commit 478db44
Show file tree

Hide file tree

Showing 15 changed files with 996 additions and 1 deletion.
diff --git a/machine_translation/.gitignore b/machine_translation/.gitignore
@@ -0,0 +1,8 @@
+data/wmt14
+data/pre-wmt14
+pretrained/wmt14_model
+gen.log
+gen_result
+train.log
+dataprovider_copy_1.py
+*.pyc
diff --git a/machine_translation/README.md b/machine_translation/README.md
diff --git a/machine_translation/data/wmt14_data.sh b/machine_translation/data/wmt14_data.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -e
+set -x
+# Work inside a new ./wmt14 directory. `mkdir` is used without -p on purpose:
+# if the directory already exists the command fails and `set -e` aborts the
+# script instead of downloading on top of an earlier copy.
+mkdir wmt14
+cd wmt14
+
+# download the dataset
+wget http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/bitexts.tgz
+wget http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz
+
+# untar the dataset, decompress the training files and drop the archives
+tar -zxvf bitexts.tgz
+tar -zxvf dev+test.tgz
+gunzip bitexts.selected/*
+mv bitexts.selected train
+rm bitexts.tgz
+rm dev+test.tgz
+
+# separate the dev and test dataset:
+# ntst1213.* goes to test/, ntst14.* goes to gen/
+mkdir test gen
+mv dev/ntst1213.* test
+mv dev/ntst14.* gen
+rm -rf dev
+
+set +x
+# rename the suffix, .fr->.src, .en->.trg
+# The files are enumerated with a glob rather than by parsing `ls` output, and
+# every expansion is quoted, so names containing whitespace or glob characters
+# are renamed correctly.
+for dir in train test gen
+do
+  for file in "$dir"/*
+  do
+    if [ "${file##*.}" = "fr" ]; then
+      mv "$file" "${file/%fr/src}"
+    elif [ "${file##*.}" = "en" ]; then
+      mv "$file" "${file/%en/trg}"
+    fi
+  done
+done
diff --git a/machine_translation/dataprovider.py b/machine_translation/dataprovider.py
@@ -0,0 +1,94 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer.PyDataProvider2 import *
+
+# Index substituted for any word that is not found in a dictionary
+# (used as the default of the `.get(w, UNK_IDX)` lookups in this module).
+# NOTE(review): presumably the dictionary files keep the unknown-word token
+# on their third line (0-based index 2) -- confirm against the dict files.
+UNK_IDX = 2
+# Sentence start / end marker tokens. Both are looked up in the dictionaries
+# with plain indexing, so a dictionary lacking either one raises KeyError.
+START = "<s>"
+END = "<e>"
+
+
+def hook(settings, src_dict_path, trg_dict_path, is_generating, file_list,
+         **kwargs):
+    # job_mode = 1: training mode
+    # job_mode = 0: generating mode
+    settings.job_mode = not is_generating
+
+    def fun(dict_path):
+        out_dict = dict()
+        with open(dict_path, "r") as fin:
+            out_dict = {
+                line.strip(): line_count
+                for line_count, line in enumerate(fin)
+            }
+        return out_dict
+
+    settings.src_dict = fun(src_dict_path)
+    settings.trg_dict = fun(trg_dict_path)
+
+    settings.logger.info("src dict len : %d" % (len(settings.src_dict)))
+
+    if settings.job_mode:
+        settings.slots = {
+            'source_language_word':
+            integer_value_sequence(len(settings.src_dict)),
+            'target_language_word':
+            integer_value_sequence(len(settings.trg_dict)),
+            'target_language_next_word':
+            integer_value_sequence(len(settings.trg_dict))
+        }
+        settings.logger.info("trg dict len : %d" % (len(settings.trg_dict)))
+    else:
+        settings.slots = {
+            'source_language_word':
+            integer_value_sequence(len(settings.src_dict)),
+            'sent_id':
+            integer_value_sequence(len(open(file_list[0], "r").readlines()))
+        }
+
+
+def _get_ids(s, dictionary):
+    words = s.strip().split()
+    return [dictionary[START]] + \
+           [dictionary.get(w, UNK_IDX) for w in words] + \
+           [dictionary[END]]
+
+
+@provider(init_hook=hook, pool_size=50000)
+def process(settings, file_name):
+    with open(file_name, 'r') as f:
+        for line_count, line in enumerate(f):
+            line_split = line.strip().split('\t')
+            if settings.job_mode and len(line_split) != 2:
+                continue
+            src_seq = line_split[0]  # one source sequence
+            src_ids = _get_ids(src_seq, settings.src_dict)
+
+            if settings.job_mode:
+                trg_seq = line_split[1]  # one target sequence
+                trg_words = trg_seq.split()
+                trg_ids = [settings.trg_dict.get(w, UNK_IDX) for w in trg_words]
+
+                # remove sequence whose length > 80 in training mode
+                if len(src_ids) > 80 or len(trg_ids) > 80:
+                    continue
+                trg_ids_next = trg_ids + [settings.trg_dict[END]]
+                trg_ids = [settings.trg_dict[START]] + trg_ids
+                yield {
+                    'source_language_word': src_ids,
+                    'target_language_word': trg_ids,
+                    'target_language_next_word': trg_ids_next
+                }
+            else:
+                yield {'source_language_word': src_ids, 'sent_id': [line_count]}
diff --git a/machine_translation/gen.sh b/machine_translation/gen.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -e
+# Without pipefail the pipeline below reports only the exit status of `tee`,
+# so a failing `paddle train` would go unnoticed despite `set -e`.
+set -o pipefail
+
+# Run the seqToseq_net.py config as a test job with is_generating=1, reading
+# the model from pretrained/wmt14_model. The config receives
+# gen_trans_file="gen_result" through --config_args. stdout and stderr are
+# merged and copied to gen.log as well as the terminal.
+paddle train \
+    --job=test \
+    --config='seqToseq_net.py' \
+    --save_dir='pretrained/wmt14_model' \
+    --use_gpu=false \
+    --num_passes=13 \
+    --test_pass=12 \
+    --trainer_count=1 \
+    --config_args=is_generating=1,gen_trans_file="gen_result" \
+    2>&1 | tee 'gen.log'
diff --git a/machine_translation/image/encoder_attention.PNG b/machine_translation/image/encoder_attention.PNG
diff --git a/machine_translation/image/encoder_attention.png b/machine_translation/image/encoder_attention.png
diff --git a/machine_translation/image/encoder_decoder.png b/machine_translation/image/encoder_decoder.png
diff --git a/machine_translation/image/gru.PNG b/machine_translation/image/gru.PNG
diff --git a/machine_translation/image/gru.png b/machine_translation/image/gru.png
diff --git a/machine_translation/image/nmt.png b/machine_translation/image/nmt.png
diff --git a/machine_translation/moses_bleu.sh b/machine_translation/moses_bleu.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -e
+set -x
+echo "Downloading multi-bleu.perl"
+wget https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/generic/multi-bleu.perl --no-check-certificate
diff --git a/machine_translation/pretrained/wmt14_model.sh b/machine_translation/pretrained/wmt14_model.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -e
+set -x
+
+# download the pretrained model
+wget http://paddlepaddle.bj.bcebos.com/model_zoo/wmt14_model.tar.gz
+
+# untar the model
+tar -zxvf wmt14_model.tar.gz
+rm wmt14_model.tar.gz 
diff --git a/machine_translation/seqToseq_net.py b/machine_translation/seqToseq_net.py
@@ -0,0 +1,168 @@
+# edit-mode: -*- python -*-
+
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from paddle.trainer_config_helpers import *
+
+### Data Definiation
+data_dir = "./data/pre-wmt14"
+src_lang_dict = os.path.join(data_dir, 'src.dict')
+trg_lang_dict = os.path.join(data_dir, 'trg.dict')
+is_generating = get_config_arg("is_generating", bool, False)
+
+if not is_generating:
+    train_list = os.path.join(data_dir, 'train.list')
+    test_list = os.path.join(data_dir, 'test.list')
+else:
+    train_list = None
+    test_list = os.path.join(data_dir, 'gen.list')
+
+define_py_data_sources2(
+    train_list,
+    test_list,
+    module="dataprovider",
+    obj="process",
+    args={
+        "src_dict_path": src_lang_dict,
+        "trg_dict_path": trg_lang_dict,
+        "is_generating": is_generating
+    })
+
+### Algorithm Configuration
+settings(
+    learning_method = AdamOptimizer(),
+    batch_size = 50 if not is_generating else 1,
+    learning_rate = 5e-4 if not is_generating else 0)
+
+### Network Architecture
+source_dict_dim = len(open(src_lang_dict, "r").readlines())
+target_dict_dim = len(open(trg_lang_dict, "r").readlines())
+word_vector_dim = 512 # dimension of word vector
+decoder_size = 512 # dimension of hidden unit in GRU Decoder network
+encoder_size = 512 # dimension of hidden unit in GRU Encoder network
+
+if is_generating:
+    beam_size=3 # expand width in beam search
+    max_length=250 # a stop condition of sequence generation
+    gen_trans_file = get_config_arg("gen_trans_file", str, None)
+
+#### Encoder
+# Source word ids -> embeddings -> two GRUs over the same embeddings (one of
+# them with reverse=True) -> concatenation of both GRU outputs.
+src_word_id = data_layer(name='source_language_word', size=source_dict_dim)
+src_embedding = embedding_layer(
+    input=src_word_id,
+    size=word_vector_dim,
+    param_attr=ParamAttr(name='_source_language_embedding'))
+src_forward = simple_gru(input=src_embedding, size=encoder_size)
+src_backward = simple_gru(
+    input=src_embedding, size=encoder_size, reverse=True)
+encoded_vector = concat_layer(input=[src_forward, src_backward])
+
+# Projection of the encoder output to decoder_size; passed to
+# simple_attention as encoded_proj inside the decoder step.
+with mixed_layer(size=decoder_size) as encoded_proj:
+    encoded_proj += full_matrix_projection(input=encoded_vector)
+
+# The decoder memory is booted from a tanh projection of first_seq applied
+# to the reversed GRU's output.
+# NOTE(review): presumably first_seq picks the first time step of the
+# sequence -- confirm against the paddle layer documentation.
+backward_first = first_seq(input=src_backward)
+with mixed_layer(
+        size=decoder_size,
+        act=TanhActivation(), ) as decoder_boot:
+    decoder_boot += full_matrix_projection(input=backward_first)
+
+#### Decoder
+def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
+    """Single decoder step: attention context + GRU step + softmax output.
+
+    Used as the ``step`` function of both ``recurrent_group`` (training) and
+    ``beam_search`` (generation) in this config.
+
+    Args:
+        enc_vec: encoder output sequence, given to ``simple_attention`` as
+            ``encoded_sequence``.
+        enc_proj: projected encoder output, given to ``simple_attention`` as
+            ``encoded_proj``.
+        current_word: input for the current step (the target embedding in
+            training, the GeneratedInput in generation).
+
+    Returns:
+        A softmax-activated mixed layer of size ``target_dict_dim``.
+    """
+    # The memory carries the same name as the gru_step_layer below and is
+    # booted from decoder_boot.
+    decoder_mem = memory(
+        name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
+
+    context = simple_attention(
+        encoded_sequence=enc_vec,
+        encoded_proj=enc_proj,
+        decoder_state=decoder_mem, )
+
+    # Context and current word are both projected to decoder_size * 3.
+    # NOTE(review): presumably the factor 3 covers the GRU's two gates plus
+    # the candidate state -- confirm against gru_step_layer's input contract.
+    with mixed_layer(size=decoder_size * 3) as decoder_inputs:
+        decoder_inputs += full_matrix_projection(input=context)
+        decoder_inputs += full_matrix_projection(input=current_word)
+
+    gru_step = gru_step_layer(
+        name='gru_decoder',
+        input=decoder_inputs,
+        output_mem=decoder_mem,
+        size=decoder_size)
+
+    with mixed_layer(
+            size=target_dict_dim, bias_attr=True,
+            act=SoftmaxActivation()) as out:
+        out += full_matrix_projection(input=gru_step)
+    return out
+
+decoder_group_name = "decoder_group"
+# The first two step inputs are the same in both modes: the encoder output
+# and its projection, both wrapped as StaticInput sequences. The third input
+# (the current word) is appended per mode below.
+group_inputs = [
+    StaticInput(
+        input=encoded_vector, is_seq=True), StaticInput(
+            input=encoded_proj, is_seq=True)
+]
+
+if not is_generating:
+    trg_embedding = embedding_layer(
+        input=data_layer(
+            name='target_language_word', size=target_dict_dim),
+        size=word_vector_dim,
+        param_attr=ParamAttr(name='_target_language_embedding'))
+    group_inputs.append(trg_embedding)
+
+    # For a decoder equipped with an attention mechanism, in training,
+    # the target embedding (the ground truth) is the data input,
+    # while the encoded source sequence is accessed as an unbounded memory.
+    # Here, the StaticInput defines a read-only memory
+    # for the recurrent_group.
+    decoder = recurrent_group(
+        name=decoder_group_name,
+        step=gru_decoder_with_attention,
+        input=group_inputs)
+
+    # The label is the target sequence shifted by one word
+    # ('target_language_next_word' from the data provider).
+    lbl = data_layer(name='target_language_next_word', size=target_dict_dim)
+    cost = classification_cost(input=decoder, label=lbl)
+    outputs(cost)
+else:
+    # In generation, the decoder predicts the next target word based on
+    # the encoded source sequence and the last generated target word.
+
+    # The encoded source sequence (encoder's output) must be specified by
+    # StaticInput, which is a read-only memory.
+    # The embedding of the last generated word is fetched automatically by
+    # GeneratedInput, which is initialized by a start mark, such as <s>,
+    # and must be included in generation.
+
+
+    # Refers to the embedding by the same parameter name as the training
+    # branch ('_target_language_embedding').
+    trg_embedding = GeneratedInput(
+        size=target_dict_dim,
+        embedding_name='_target_language_embedding',
+        embedding_size=word_vector_dim)
+    group_inputs.append(trg_embedding)
+
+    # NOTE(review): bos_id=0 / eos_id=1 presumably are the ids of <s> and
+    # <e>, i.e. the first two lines of trg.dict -- confirm against the
+    # dictionary file.
+    beam_gen = beam_search(
+        name=decoder_group_name,
+        step=gru_decoder_with_attention,
+        input=group_inputs,
+        bos_id=0,
+        eos_id=1,
+        beam_size=beam_size,
+        max_length=max_length)
+
+    # Receives the generated sequences, the 'sent_id' data layer, the target
+    # dictionary file and gen_trans_file as the result file.
+    seqtext_printer_evaluator(
+        input=beam_gen,
+        id_input=data_layer(
+            name="sent_id", size=1),
+        dict_file=trg_lang_dict,
+        result_file=gen_trans_file)
+    outputs(beam_gen)