GEM-benchmark · richplant · Oct 29, 2021
diff --git a/filters/universal_bias/README.md b/filters/universal_bias/README.md
@@ -2,7 +2,7 @@
 
 ## What type of a filter is this?
 
-This filter is currently contains lexical seeds for 10 categories () in English, however it can be extended to any language or topic by simple addition of desired entries to `lexicals.json` 
+This filter currently contains lexical seeds for 10 categories in US and UK English; however, it can be extended to any language or topic by simply adding the desired entries to the `lexicals.json` 
 file in current directoryalong with the text corpus in corresponding language.
 The minority parameter is a potentially underrepresented group, defined with its own set of keywords; the majority parameter is a set of keywords, representing the dominating group.
 The filter returns "True" if the minority group is indeed underrepresented, "False" otherwise.
@@ -30,13 +30,13 @@ sentences = [ "He is going to make a cake.",
               "Nobody likes washing dishes",
               "He agreed to help me" ]
 
-language = "en"
+culture = "en_us"
 category = "gender"
 minority = "female"
 majority = "male"
 
 
-f = UniversalBiasFilter(language, category, minority, majority)
+f = UniversalBiasFilter(culture, category, minority, majority)
 
 f.filter(sentences)
 ```
@@ -78,7 +78,7 @@ This filter accepts unigram arrays, the n-gramms won't give the desired output,
 ## Structure of lexical seeds
 Current struncture of the `lexicals.json` is as follows:
 ```
-"en": {
+"en_us": {
 		"religion": {
 			"christianity": [],
 			"buddhism_hinduism_jainism": [],
@@ -120,7 +120,7 @@ Current struncture of the `lexicals.json` is as follows:
 			"old": []			
 		},
 
-		"appearencence": {
+		"appearance": {
 			"attractive": [],
 			"unattractive": []
 		},
@@ -146,7 +146,27 @@ Current struncture of the `lexicals.json` is as follows:
 		}	
 	}
 ```
-Changing the language key with the corresponding lexical seeds precision allows adapt this data structure to any language. 
+Changing the culture key and providing the corresponding lexical seeds allows this data structure to be adapted to any language/culture pair. However, since the categories may not remain stable across cultural boundaries, the categories and groups available for a given culture can be listed by calling the following function:
+
+```
+UniversalBiasFilter.list_groups("en_us")
+
+{
+    'religion': ['christianity', 'buddhism_hinduism_jainism', 'confucianism', 'islam', 'judaism', 'atheism'], 
+    'race': ['white', 'black', 'asian', 'latinx', 'american_indian'], 
+    'ethnicity': ['european', 'african', 'eurasian', 'asian', 'hispanic', 'american_indian'], 
+    'gender': ['male', 'female'], 
+    'sexual_orientation': ['hetero', 'homo'], 
+    'age': ['young', 'old'], 
+    'appearance': ['attractive', 'unattractive'], 
+    'disability': ['healthy', 'disabled'], 
+    'experience': ['experienced', 'inexperienced'], 
+    'education': ['educated', 'uneducated'], 
+    'economic_status': ['rich', 'poor']
+}
+
+```
+
 The categories with their respective attributes can also be modified (the data extraction is made dynamicaly in the code). 
 
 

diff --git a/filters/universal_bias/filter.py b/filters/universal_bias/filter.py
@@ -1,14 +1,16 @@
 from interfaces.SentenceOperation import SentenceOperation
 from tasks.TaskTypes import TaskType
 import re
+import json
+
 
 class UniversalBiasFilter(SentenceOperation):
     tasks = [TaskType.TEXT_TO_TEXT_GENERATION]
     keywords = ["rule-based", "social-reasoning"]
 
-    def __init__(self, language=None, category=None, minority_group=None, majority_group=None, minority=None, majority=None):
+    def __init__(self, culture=None, category=None, minority_group=None, majority_group=None, minority=None, majority=None):
         super().__init__()
-        self.language = language
+        self.culture = culture
         self.category = category
         self.minority_group = minority_group
         self.majority_group = majority_group
@@ -31,10 +33,10 @@ def flag_sentences(self, sentences):
 
         # Retrieve relevant data extracts
         try:
-            minority_group = data[self.language][self.category][self.minority_group]
-            majority_group = data[self.language][self.category][self.majority_group]
+            minority_group = data[self.culture][self.category][self.minority_group]
+            majority_group = data[self.culture][self.category][self.majority_group]
         except NameError as error:
-            print('The specified language, category of group is not supported or misformatted. Please provide valid arguments to the filter() method.') 
+            print('The specified culture, category of group is not supported or misformatted. Please provide valid arguments to the filter() method.') 
 
         # Close names file
         f.close()
@@ -163,6 +165,16 @@ def sort_groups(flagged_corpus):
 
         return minority_group, majority_group, neutral_group
 
+    @staticmethod
+    def list_groups(culture):
+        """Return {category: [group names]} for `culture`; {} (with a message) if it is unknown."""
+        with open('filters/universal_bias/lexicals.json', encoding='utf-8') as f:
+            data = json.load(f)
+        # A missing key raises KeyError (not NameError), so test membership explicitly.
+        if culture not in data:
+            print('The specified culture is not a valid entry.')
+            return {}
+        return {key: list(val.keys()) for key, val in data[culture].items()}
 
     def filter(self, sentences: []) -> bool:
         """
@@ -183,7 +195,6 @@ def filter(self, sentences: []) -> bool:
         minority_percentage = 100 * float(minority_count) / float(len(sentences))
         majority_percentage = 100 * float(majority_count) / float(len(sentences))
 
-
         # If the number of sentences in terms of percentage in the minority group
         # is lower than in the majority group, set bias to True
         # Note, that the neutral group is not taken into account in this calculation
@@ -192,4 +203,4 @@ def filter(self, sentences: []) -> bool:
         else:
             biased = False
 
-        return biased
+        return biased