EveripediaNetwork · brunneis · Oct 24, 2024
diff --git a/README.md b/README.md
@@ -33,7 +33,64 @@ client = Nootropic(
     # system='System prompt.',
     # disable_cache=False,
 )
+```
+
+# Extractors
+
+Nootropic provides built-in extractors to help parse structured data from LLM responses. Here's how to use them:
+
+## XML Extractor
+
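+The `XMLExtractor` parses XML-like structures from the text. Called without a tag, it returns a single dictionary keyed by tag name, so when the same top-level tag appears more than once only its last occurrence is kept. Called with `tag=...`, it returns a list with one entry per matching element.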
+
+```python
+from nootropic.extractors import XMLExtractor
+
+# Example LLM response with XML
+llm_response = '''
+Some text...
+<person>
+    <name>John Doe</name>
+    <age>30</age>
+</person>
+More text...
+<person>
+    <name>Jane Smith</name>
+    <age>25</age>
+</person>
+Even more text...
+'''
+
+# Extract all XML structures into one dict (a repeated tag keeps only its last occurrence)
+xml_data = XMLExtractor.extract(llm_response)
+print('All XML data:', xml_data)
+
+# Extract XML structures with a specific tag
+tagged_xml_data = XMLExtractor.extract(
+    llm_response,
+    tag='person',
+)
+print('Tagged XML data:', tagged_xml_data)
+```
+
+## JSON Extractor
+
+The `JSONExtractor` finds and parses all valid JSON objects in the text, returning a list of parsed JSON objects.
+
+```python
+from nootropic.extractors import JSONExtractor
+
+# Example LLM response with JSON
+llm_response = '''
+Some text...
+{"name": "John Doe", "age": 30}
+More text...
+{"name": "Jane Smith", "age": 25, "city": "New York"}
+Even more text...
+'''
 
+json_data = JSONExtractor.extract(llm_response)
+print('JSON data:', json_data)
 ```
 
 # Supported SDKs

diff --git a/nootropic/__init__.py b/nootropic/__init__.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
-from .nootropic import *
+from .nootropic import Nootropic  # noqa: F401
+from .extractors import XMLExtractor, JSONExtractor  # noqa: F401
 
-__version__ = '1.2410.1'
+__version__ = '1.2410.2'
diff --git a/nootropic/extractors.py b/nootropic/extractors.py
@@ -0,0 +1,65 @@
+import re
+import json
+from typing import Dict, List, Optional, Union, Any
+from abc import ABC, abstractmethod
+
+
+class BaseExtractor(ABC):
+    """Abstract interface implemented by the extractors in this module.
+
+    Each subclass provides a static ``extract`` that takes raw text and
+    returns the structured data it finds there.
+    """
+
+    @staticmethod
+    @abstractmethod
+    def extract(text: str, **kwargs: Any) -> Union[Dict[str, Any], List[Any]]:
+        """Extract structured data from ``text``; subclasses override this."""
+
+
+class XMLExtractor(BaseExtractor):
+    """Extract XML-like elements embedded in free-form text.
+
+    Parsing is regex based rather than a real XML parser: only elements
+    with a matching closing tag are recognised, attributes are ignored,
+    and an element nested inside another element of the same name is not
+    handled.
+    """
+
+    @staticmethod
+    def extract(
+        text: str,
+        tag: Optional[str] = None,
+    ) -> Union[Dict[str, Any], List[Union[Dict[str, Any], str]]]:
+        """Parse the XML-like elements found in ``text``.
+
+        Args:
+            text: Text that may contain XML-like elements.
+            tag: Optional element name; when given, only elements with
+                this name are extracted.
+
+        Returns:
+            Without ``tag``: a dict mapping each element name to its parsed
+            content; a repeated name keeps only its last occurrence.
+            With ``tag``: a list with one entry per matching element.
+            Parsed content is a nested dict when child elements are found,
+            otherwise the stripped inner text.
+        """
+        # Opening tag (attributes allowed), lazy body, matching closing tag.
+        element_re = re.compile(r'<(\w+)(?:\s+[^>]*)?>(.*?)</\1>', re.DOTALL)
+
+        def parse_content(content: str) -> Union[Dict[str, Any], str]:
+            content = content.strip()
+            children = parse_xml(content) if '<' in content else {}
+            # Fall back to the raw text when no child element was found, so
+            # a stray '<' in plain text does not discard the content.
+            return children or content
+
+        def parse_xml(xml_string: str) -> Dict[str, Any]:
+            return {
+                tag_name: parse_content(content)
+                for tag_name, content in element_re.findall(xml_string)
+            }
+
+        if not tag:
+            return parse_xml(text)
+
+        # Escape the tag so regex metacharacters are taken literally, and
+        # accept attributes on the opening tag like the generic parser does.
+        tag_re = re.compile(
+            r'<{0}(?:\s+[^>]*)?>(.*?)</{0}>'.format(re.escape(tag)),
+            re.DOTALL,
+        )
+        return [parse_content(body) for body in tag_re.findall(text)]
+
+
+class JSONExtractor(BaseExtractor):
+    """Extract JSON objects embedded in free-form text."""
+
+    @staticmethod
+    def extract(text: str) -> List[Dict[str, Any]]:
+        """Return every JSON object found in ``text``, in order of appearance.
+
+        Each ``{`` is tried as the start of an object using the standard
+        JSON decoder, so objects of any nesting depth and braces inside
+        string values are handled. After a successful parse, scanning
+        resumes past the end of that object, so nested objects are only
+        reported as part of their parent. A ``{`` that does not start valid
+        JSON is skipped and scanning continues with the next one.
+
+        Args:
+            text: Text that may contain JSON objects.
+
+        Returns:
+            The parsed objects as dicts; empty when none are found.
+        """
+        decoder = json.JSONDecoder()
+        found = []
+        start = text.find('{')
+        while start != -1:
+            try:
+                obj, end = decoder.raw_decode(text, start)
+            except json.JSONDecodeError:
+                # Not valid JSON from this brace; move on to the next one.
+                end = start + 1
+            else:
+                found.append(obj)
+            start = text.find('{', end)
+        return found