From 1b037c26e1bfdc1b4e0065b0bc05cad943008271 Mon Sep 17 00:00:00 2001 From: "Paulo S. Costa" Date: Tue, 9 Jun 2020 11:16:38 -0700 Subject: [PATCH 1/8] Test filters is respected at character level --- tests/text_test.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/text_test.py b/tests/text_test.py index 2c5200ae..371b9005 100644 --- a/tests/text_test.py +++ b/tests/text_test.py @@ -184,6 +184,17 @@ def test_tokenizer_oov_flag_and_num_words(): assert trans_text == 'this ' +def test_tokenizer_filter_char_level(): + """It does not tokenize the characters in ``filters`` when ``char_level=True`` + """ + x_train = ['This text has only known words this text'] + + tokenizer = keras.preprocessing.text.Tokenizer(char_level=True, + filters="e") + tokenizer.fit_on_texts(x_train) + assert "e" not in tokenizer.word_index + + def test_sequences_to_texts_with_num_words_and_oov_token(): x_train = ['This text has only known words this text'] x_test = ['This text has some unknown words'] From 01f1ab75f27d46e73792e9d3bb34412dac742047 Mon Sep 17 00:00:00 2001 From: "Paulo S. Costa" Date: Tue, 9 Jun 2020 13:51:55 -0700 Subject: [PATCH 2/8] Reword doc --- tests/text_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/text_test.py b/tests/text_test.py index 371b9005..baee44e9 100644 --- a/tests/text_test.py +++ b/tests/text_test.py @@ -185,7 +185,7 @@ def test_tokenizer_oov_flag_and_num_words(): def test_tokenizer_filter_char_level(): - """It does not tokenize the characters in ``filters`` when ``char_level=True`` + """It does not tokenize filtered characters at the character level. """ x_train = ['This text has only known words this text'] From 8089e4f22fcdb6d57a865ea3db63ef1544672fbd Mon Sep 17 00:00:00 2001 From: "Paulo S. 
Costa" Date: Tue, 9 Jun 2020 13:52:59 -0700 Subject: [PATCH 3/8] Use text import --- tests/text_test.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/text_test.py b/tests/text_test.py index baee44e9..4da0598e 100644 --- a/tests/text_test.py +++ b/tests/text_test.py @@ -189,8 +189,7 @@ def test_tokenizer_filter_char_level(): """ x_train = ['This text has only known words this text'] - tokenizer = keras.preprocessing.text.Tokenizer(char_level=True, - filters="e") + tokenizer = text.Tokenizer(filters="e", char_level=True) tokenizer.fit_on_texts(x_train) assert "e" not in tokenizer.word_index From bff28ffd03be1f7ba58c7899a775909976cf5c80 Mon Sep 17 00:00:00 2001 From: "Paulo S. Costa" Date: Tue, 9 Jun 2020 13:53:22 -0700 Subject: [PATCH 4/8] Modify text to not expect . count --- tests/text_test.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/text_test.py b/tests/text_test.py index 4da0598e..f8f6888d 100644 --- a/tests/text_test.py +++ b/tests/text_test.py @@ -294,9 +294,8 @@ def test_tokenizer_lower_flag(): char_tokenizer.fit_on_texts(texts) expected_word_counts = OrderedDict([('t', 11), ('h', 5), ('e', 6), (' ', 14), ('c', 2), ('a', 6), ('s', 2), ('o', 6), - ('n', 4), ('m', 1), ('.', 3), ('d', 3), - ('g', 5), ('l', 2), ('i', 2), ('v', 1), - ('r', 1)]) + ('n', 4), ('m', 1), ('d', 3), ('g', 5), + ('l', 2), ('i', 2), ('v', 1), ('r', 1)]) assert char_tokenizer.word_counts == expected_word_counts From bc1e30f027294c4093cfbd28c2f90fc8cabbb1af Mon Sep 17 00:00:00 2001 From: "Paulo S. 
Costa" Date: Tue, 9 Jun 2020 14:09:53 -0700 Subject: [PATCH 5/8] Test all possible inputs --- tests/text_test.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/text_test.py b/tests/text_test.py index f8f6888d..048cc1e1 100644 --- a/tests/text_test.py +++ b/tests/text_test.py @@ -184,11 +184,13 @@ def test_tokenizer_oov_flag_and_num_words(): assert trans_text == 'this ' -def test_tokenizer_filter_char_level(): +@pytest.mark.parametrize( + "x_train", + ("ae", ["ae", "er"], [["ae", "er"], ["ee", "a"]]) +) +def test_tokenizer_filter_char_level(x_train): """It does not tokenize filtered characters at the character level. """ - x_train = ['This text has only known words this text'] - tokenizer = text.Tokenizer(filters="e", char_level=True) tokenizer.fit_on_texts(x_train) assert "e" not in tokenizer.word_index From 9b255a1cb9cc731f9c06a5ca6251ce781679b862 Mon Sep 17 00:00:00 2001 From: "Paulo S. Costa" Date: Tue, 9 Jun 2020 14:10:23 -0700 Subject: [PATCH 6/8] Remove filtered characters at char level --- keras_preprocessing/text.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/keras_preprocessing/text.py b/keras_preprocessing/text.py index 5ee23dbd..aca43cc1 100644 --- a/keras_preprocessing/text.py +++ b/keras_preprocessing/text.py @@ -222,9 +222,19 @@ def fit_on_texts(self, texts): a generator of strings (for memory-efficiency), or a list of list of strings. """ + filtered_characters = set(self.filters) for text in texts: self.document_count += 1 if self.char_level or isinstance(text, list): + if isinstance(text, list): + text = ["".join(char + for char in text_elem + if char not in filtered_characters) + for text_elem in text] + else: + text = "".join(char + for char in text + if char not in filtered_characters) if self.lower: if isinstance(text, list): text = [text_elem.lower() for text_elem in text] From 74afe1eba070e6fc990cb94edcfdda7fd411ca4a Mon Sep 17 00:00:00 2001 From: "Paulo S. 
Costa" Date: Tue, 9 Jun 2020 14:47:32 -0700 Subject: [PATCH 7/8] Don't remove filtered characters on list elements --- keras_preprocessing/text.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/keras_preprocessing/text.py b/keras_preprocessing/text.py index aca43cc1..40414185 100644 --- a/keras_preprocessing/text.py +++ b/keras_preprocessing/text.py @@ -226,12 +226,7 @@ def fit_on_texts(self, texts): for text in texts: self.document_count += 1 if self.char_level or isinstance(text, list): - if isinstance(text, list): - text = ["".join(char - for char in text_elem - if char not in filtered_characters) - for text_elem in text] - else: + if not isinstance(text, list): text = "".join(char for char in text if char not in filtered_characters) From 489dcfdca680a3fba52a78efc006f41c22755808 Mon Sep 17 00:00:00 2001 From: "Paulo S. Costa" Date: Tue, 9 Jun 2020 14:48:08 -0700 Subject: [PATCH 8/8] Remove test on lists of lists --- tests/text_test.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/text_test.py b/tests/text_test.py index 048cc1e1..97e66ebc 100644 --- a/tests/text_test.py +++ b/tests/text_test.py @@ -184,10 +184,7 @@ def test_tokenizer_oov_flag_and_num_words(): assert trans_text == 'this ' -@pytest.mark.parametrize( - "x_train", - ("ae", ["ae", "er"], [["ae", "er"], ["ee", "a"]]) -) +@pytest.mark.parametrize("x_train", ("ae", ["ae", "er"])) def test_tokenizer_filter_char_level(x_train): """It does not tokenize filtered characters at the character level. """