From 1b037c26e1bfdc1b4e0065b0bc05cad943008271 Mon Sep 17 00:00:00 2001
From: "Paulo S. Costa" <pauloscosta5@gmail.com>
Date: Tue, 9 Jun 2020 11:16:38 -0700
Subject: [PATCH 1/8] Test that filters are respected at character level

---
 tests/text_test.py | 11 +++++++++++
 1 file changed, 11 insertions(+)
diff --git a/tests/text_test.py b/tests/text_test.py
index 2c5200ae..371b9005 100644
--- a/tests/text_test.py
+++ b/tests/text_test.py
@@ -184,6 +184,17 @@ def test_tokenizer_oov_flag_and_num_words():
     assert trans_text == 'this <unk> <unk> <unk> <unk> <unk>'
 
 
+def test_tokenizer_filter_char_level():
+    """It does not tokenize the characters in ``filters`` when ``char_level=True``
+    """
+    x_train = ['This text has only known words this text']
+
+    tokenizer = keras.preprocessing.text.Tokenizer(char_level=True,
+                                                   filters="e")
+    tokenizer.fit_on_texts(x_train)
+    assert "e" not in tokenizer.word_index
+
+
 def test_sequences_to_texts_with_num_words_and_oov_token():
     x_train = ['This text has only known words this text']
     x_test = ['This text has some unknown words']

From 01f1ab75f27d46e73792e9d3bb34412dac742047 Mon Sep 17 00:00:00 2001
From: "Paulo S. Costa" <pauloscosta5@gmail.com>
Date: Tue, 9 Jun 2020 13:51:55 -0700
Subject: [PATCH 2/8] Reword doc

---
 tests/text_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/text_test.py b/tests/text_test.py
index 371b9005..baee44e9 100644
--- a/tests/text_test.py
+++ b/tests/text_test.py
@@ -185,7 +185,7 @@ def test_tokenizer_oov_flag_and_num_words():
 
 
 def test_tokenizer_filter_char_level():
-    """It does not tokenize the characters in ``filters`` when ``char_level=True``
+    """It does not tokenize filtered characters at the character level.
     """
     x_train = ['This text has only known words this text']
 

From 8089e4f22fcdb6d57a865ea3db63ef1544672fbd Mon Sep 17 00:00:00 2001
From: "Paulo S. Costa" <pauloscosta5@gmail.com>
Date: Tue, 9 Jun 2020 13:52:59 -0700
Subject: [PATCH 3/8] Use text import

---
 tests/text_test.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tests/text_test.py b/tests/text_test.py
index baee44e9..4da0598e 100644
--- a/tests/text_test.py
+++ b/tests/text_test.py
@@ -189,8 +189,7 @@ def test_tokenizer_filter_char_level():
     """
     x_train = ['This text has only known words this text']
 
-    tokenizer = keras.preprocessing.text.Tokenizer(char_level=True,
-                                                   filters="e")
+    tokenizer = text.Tokenizer(filters="e", char_level=True)
     tokenizer.fit_on_texts(x_train)
     assert "e" not in tokenizer.word_index
 

From bff28ffd03be1f7ba58c7899a775909976cf5c80 Mon Sep 17 00:00:00 2001
From: "Paulo S. Costa" <pauloscosta5@gmail.com>
Date: Tue, 9 Jun 2020 13:53:22 -0700
Subject: [PATCH 4/8] Modify test to not expect '.' count

---
 tests/text_test.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tests/text_test.py b/tests/text_test.py
index 4da0598e..f8f6888d 100644
--- a/tests/text_test.py
+++ b/tests/text_test.py
@@ -294,9 +294,8 @@ def test_tokenizer_lower_flag():
     char_tokenizer.fit_on_texts(texts)
     expected_word_counts = OrderedDict([('t', 11), ('h', 5), ('e', 6), (' ', 14),
                                         ('c', 2), ('a', 6), ('s', 2), ('o', 6),
-                                        ('n', 4), ('m', 1), ('.', 3), ('d', 3),
-                                        ('g', 5), ('l', 2), ('i', 2), ('v', 1),
-                                        ('r', 1)])
+                                        ('n', 4), ('m', 1), ('d', 3), ('g', 5),
+                                        ('l', 2), ('i', 2), ('v', 1), ('r', 1)])
     assert char_tokenizer.word_counts == expected_word_counts
 
 

From bc1e30f027294c4093cfbd28c2f90fc8cabbb1af Mon Sep 17 00:00:00 2001
From: "Paulo S. Costa" <pauloscosta5@gmail.com>
Date: Tue, 9 Jun 2020 14:09:53 -0700
Subject: [PATCH 5/8] Test all possible inputs

---
 tests/text_test.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tests/text_test.py b/tests/text_test.py
index f8f6888d..048cc1e1 100644
--- a/tests/text_test.py
+++ b/tests/text_test.py
@@ -184,11 +184,13 @@ def test_tokenizer_oov_flag_and_num_words():
     assert trans_text == 'this <unk> <unk> <unk> <unk> <unk>'
 
 
-def test_tokenizer_filter_char_level():
+@pytest.mark.parametrize(
+    "x_train",
+    ("ae", ["ae", "er"], [["ae", "er"], ["ee", "a"]])
+)
+def test_tokenizer_filter_char_level(x_train):
     """It does not tokenize filtered characters at the character level.
     """
-    x_train = ['This text has only known words this text']
-
     tokenizer = text.Tokenizer(filters="e", char_level=True)
     tokenizer.fit_on_texts(x_train)
     assert "e" not in tokenizer.word_index

From 9b255a1cb9cc731f9c06a5ca6251ce781679b862 Mon Sep 17 00:00:00 2001
From: "Paulo S. Costa" <pauloscosta5@gmail.com>
Date: Tue, 9 Jun 2020 14:10:23 -0700
Subject: [PATCH 6/8] Remove filtered characters at char level

---
 keras_preprocessing/text.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/keras_preprocessing/text.py b/keras_preprocessing/text.py
index 5ee23dbd..aca43cc1 100644
--- a/keras_preprocessing/text.py
+++ b/keras_preprocessing/text.py
@@ -222,9 +222,19 @@ def fit_on_texts(self, texts):
                 a generator of strings (for memory-efficiency),
                 or a list of list of strings.
         """
+        filtered_characters = set(self.filters)
         for text in texts:
             self.document_count += 1
             if self.char_level or isinstance(text, list):
+                if isinstance(text, list):
+                    text = ["".join(char
+                                    for char in text_elem
+                                    if char not in filtered_characters)
+                            for text_elem in text]
+                else:
+                    text = "".join(char
+                                   for char in text
+                                   if char not in filtered_characters)
                 if self.lower:
                     if isinstance(text, list):
                         text = [text_elem.lower() for text_elem in text]

From 74afe1eba070e6fc990cb94edcfdda7fd411ca4a Mon Sep 17 00:00:00 2001
From: "Paulo S. Costa" <pauloscosta5@gmail.com>
Date: Tue, 9 Jun 2020 14:47:32 -0700
Subject: [PATCH 7/8] Don't remove filtered characters on list elements

---
 keras_preprocessing/text.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/keras_preprocessing/text.py b/keras_preprocessing/text.py
index aca43cc1..40414185 100644
--- a/keras_preprocessing/text.py
+++ b/keras_preprocessing/text.py
@@ -226,12 +226,7 @@ def fit_on_texts(self, texts):
         for text in texts:
             self.document_count += 1
             if self.char_level or isinstance(text, list):
-                if isinstance(text, list):
-                    text = ["".join(char
-                                    for char in text_elem
-                                    if char not in filtered_characters)
-                            for text_elem in text]
-                else:
+                if not isinstance(text, list):
                     text = "".join(char
                                    for char in text
                                    if char not in filtered_characters)

From 489dcfdca680a3fba52a78efc006f41c22755808 Mon Sep 17 00:00:00 2001
From: "Paulo S. Costa" <pauloscosta5@gmail.com>
Date: Tue, 9 Jun 2020 14:48:08 -0700
Subject: [PATCH 8/8] Remove test on lists of lists

---
 tests/text_test.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/tests/text_test.py b/tests/text_test.py
index 048cc1e1..97e66ebc 100644
--- a/tests/text_test.py
+++ b/tests/text_test.py
@@ -184,10 +184,7 @@ def test_tokenizer_oov_flag_and_num_words():
     assert trans_text == 'this <unk> <unk> <unk> <unk> <unk>'
 
 
-@pytest.mark.parametrize(
-    "x_train",
-    ("ae", ["ae", "er"], [["ae", "er"], ["ee", "a"]])
-)
+@pytest.mark.parametrize("x_train", ("ae", ["ae", "er"]))
 def test_tokenizer_filter_char_level(x_train):
     """It does not tokenize filtered characters at the character level.
     """