Docs #5

Merged
merged 35 commits into from
Aug 8, 2024
4ba954f
sort imports with ruff
lucaromagnoli Aug 6, 2024
ea2d66c
test_httpx_client.py imports
lucaromagnoli Aug 6, 2024
2173ac2
comment
lucaromagnoli Aug 6, 2024
34a6f1d
add makefiles
lucaromagnoli Aug 6, 2024
61a71cf
more test_models.py
lucaromagnoli Aug 6, 2024
78eaf78
docs initial commit
lucaromagnoli Aug 6, 2024
84986de
update README.rst
lucaromagnoli Aug 6, 2024
e5bd2b2
reference readme in index.rst
lucaromagnoli Aug 6, 2024
580bb8e
rename soup property to html and add both text and data to response
lucaromagnoli Aug 7, 2024
2b04631
add utils module
lucaromagnoli Aug 7, 2024
30579f6
add examples to docs
lucaromagnoli Aug 7, 2024
3df7ea1
change before to before_sleep
lucaromagnoli Aug 7, 2024
c55665d
remove pipeline for now. will come in future releases
lucaromagnoli Aug 7, 2024
09416b1
remove AttrDict not needed
lucaromagnoli Aug 7, 2024
29b2053
fix data
lucaromagnoli Aug 7, 2024
cede261
no longer support dataclasses only BaseDataItem
lucaromagnoli Aug 7, 2024
4bdc2f0
add F401 to ruff
lucaromagnoli Aug 7, 2024
b7a4829
remove data classes from service as well
lucaromagnoli Aug 7, 2024
0512674
remove logger
lucaromagnoli Aug 7, 2024
c94628e
unused import
lucaromagnoli Aug 7, 2024
24ffd17
test f401
lucaromagnoli Aug 7, 2024
993bf04
use fields
lucaromagnoli Aug 7, 2024
9f52e27
use pydata_sphinx_theme
lucaromagnoli Aug 7, 2024
c80e2f8
chnages to examples
lucaromagnoli Aug 7, 2024
1ad64d6
changes to docs
lucaromagnoli Aug 7, 2024
a6203ee
implement get_func_name
lucaromagnoli Aug 7, 2024
62fe78d
improvements
lucaromagnoli Aug 7, 2024
fb7ae26
use python 3.12
lucaromagnoli Aug 8, 2024
8ab077c
use typing_extensions
lucaromagnoli Aug 8, 2024
fe6fe69
import from dataservice.data
lucaromagnoli Aug 8, 2024
d1e74c9
reformat
lucaromagnoli Aug 8, 2024
5acabe8
add log remove utils
lucaromagnoli Aug 8, 2024
512e9f3
add I to ruff rules
lucaromagnoli Aug 8, 2024
8720d27
rename to logs
lucaromagnoli Aug 8, 2024
d5c683b
add read the docs setup
lucaromagnoli Aug 8, 2024
2 changes: 1 addition & 1 deletion .github/workflows/python-app.yml
@@ -20,7 +20,7 @@ jobs:
- name: Install Python
uses: actions/setup-python@v4
with:
- python-version: "3.11"
+ python-version: "3.12"
# see details (matrix, python-version, python-version-file, etc.)
# https://github.com/actions/setup-python
- name: Install poetry
1 change: 1 addition & 0 deletions .gitignore
@@ -139,3 +139,4 @@ dmypy.json
cython_debug/

.idea/
/temp/
16 changes: 8 additions & 8 deletions .pre-commit-config.yaml
@@ -1,16 +1,16 @@
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v2.3.0
hooks:
- id: check-yaml
- id: end-of-file-fixer
- id: trailing-whitespace
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v2.3.0
hooks:
- id: check-yaml
- id: end-of-file-fixer
- id: trailing-whitespace
- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version.
rev: v0.5.6
hooks:
- # Run the linter.
+ # Run the linter and sort imports.
  - id: ruff
- args: [ --fix ]
+ args: [--fix]
# Run the formatter.
- id: ruff-format
22 changes: 22 additions & 0 deletions .readthedocs.yaml
@@ -0,0 +1,22 @@
version: 2

build:
os: ubuntu-22.04
tools:
python: "3.12"


sphinx:
configuration: source/conf.py

# Optionally build your docs in additional formats such as PDF and ePub
# formats:
# - pdf
# - epub

# Optional but recommended, declare the Python requirements required
# to build your documentation
# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
# python:
# install:
# - requirements: docs/requirements.txt
20 changes: 20 additions & 0 deletions Makefile
@@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = source
BUILDDIR = build

# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
44 changes: 0 additions & 44 deletions README.md

This file was deleted.

49 changes: 49 additions & 0 deletions README.rst
@@ -0,0 +1,49 @@
DataService
===========

Lightweight - async - data gathering for Python.
____________________________________________________________________________________
DataService is a lightweight data gathering library for Python.

Designed for simplicity, it uses common web scraping and data gathering patterns.

No complex API to learn, just standard Python idioms.

Asynchronous implementation, synchronous interface.

How to use DataService
----------------------

To start, create a ``DataService`` instance with an ``Iterable`` of ``Request`` objects. This gives you an ``Iterator`` of data objects that you can iterate over or convert to a ``list``, a ``tuple``, a ``pd.DataFrame``, or any other data structure of your choice.

.. code-block:: python

start_requests = [Request(url="https://books.toscrape.com/index.html", callback=parse_books_page, client=HttpXClient())]
data_service = DataService(start_requests)
data = tuple(data_service)

A ``Request`` is a ``Pydantic`` model that includes the URL to fetch, a reference to the ``client`` callable, and a ``callback`` function for parsing the ``Response`` object.

The client can be any Python callable that accepts a ``Request`` object and returns a ``Response`` object. ``DataService`` provides an ``HttpXClient`` class, which is based on the ``httpx`` library, but you are free to use your own custom async client.

The callback function processes a ``Response`` object and returns either ``data`` or additional ``Request`` objects.
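The client contract described here is easy to satisfy with plain Python. The sketch below illustrates it using simplified, hypothetical stand-ins for the library's ``Request`` and ``Response`` models, and a fake client that fabricates a response instead of making a real HTTP call; only the shape of the contract (a callable taking a ``Request`` and returning a ``Response``) is taken from this README.

```python
import asyncio
from dataclasses import dataclass
from typing import Callable, Optional


# Simplified stand-ins for dataservice's Pydantic models (illustration only).
@dataclass
class Request:
    url: str
    callback: Optional[Callable] = None
    client: Optional[Callable] = None


@dataclass
class Response:
    request: Request
    text: str


class StaticClient:
    """Hypothetical client: a callable that takes a Request and returns a Response."""

    def __call__(self, request: Request):
        # Calling the client instance simply delegates to the coroutine.
        return self.make_request(request)

    async def make_request(self, request: Request) -> Response:
        # A real client would perform an async HTTP call here (e.g. with httpx).
        return Response(request=request, text=f"<html>fetched {request.url}</html>")
```

Because ``make_request`` is a coroutine, the service (or ``asyncio.run`` in a standalone script) awaits the result.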

In this trivial example, we request the `Books to Scrape <https://books.toscrape.com/index.html>`_ homepage and parse the number of books on the page.

Example ``parse_books_page`` function:

.. code-block:: python

def parse_books_page(response: Response):
articles = response.soup.find_all("article", {"class": "product_pod"})
return {
"url": response.request.url,
"title": response.soup.title.get_text(strip=True),
"articles": len(articles),
}

This function takes a ``Response`` object, which has a ``soup`` attribute (a ``BeautifulSoup`` object of the HTML content). The function parses the HTML content and returns data.

The callback function can ``return`` or ``yield`` either ``data`` (dict or dataclass) or more ``Request`` objects.

If you have used Scrapy before, you will find this pattern familiar.
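To make the return-or-yield contract concrete, here is a minimal sketch of a paginating callback. ``Request`` and ``PageResponse`` are simplified hypothetical stand-ins (the real models are Pydantic, and extraction would go through the ``soup`` attribute); only the yield-data-or-``Request`` pattern is taken from this README.

```python
from dataclasses import dataclass
from typing import Callable, Iterator, Optional, Union


@dataclass
class Request:  # simplified stand-in for dataservice's Pydantic model
    url: str
    callback: Callable


@dataclass
class PageResponse:  # hypothetical stand-in; real parsing would use BeautifulSoup
    url: str
    titles: list
    next_page_url: Optional[str] = None


def parse_books_page(response: PageResponse) -> Iterator[Union[dict, Request]]:
    # Yield one data dict per book found on the current page.
    for title in response.titles:
        yield {"url": response.url, "title": title}
    # Then queue the next page, if any, as a further Request.
    if response.next_page_url is not None:
        yield Request(url=response.next_page_url, callback=parse_books_page)
```

The service exhausts the generator, collecting the dicts as data and scheduling any yielded ``Request`` objects for fetching.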
9 changes: 7 additions & 2 deletions dataservice/__init__.py
@@ -1,17 +1,22 @@
from dataservice.clients import HttpXClient
from dataservice.config import ServiceConfig
from dataservice.data import BaseDataItem, DataWrapper
from dataservice.exceptions import RequestException, RetryableRequestException
from dataservice.logs import setup_logging
from dataservice.models import Request, Response
from dataservice.pipeline import Pipeline
from dataservice.service import DataService

__all__ = [
"BaseDataItem",
"DataService",
"DataWrapper",
"HttpXClient",
"Pipeline",
"Request",
"Response",
"RequestException",
"RetryableRequestException",
"ServiceConfig",
"setup_logging",
]

__version__ = "0.0.1"
23 changes: 18 additions & 5 deletions dataservice/clients.py
@@ -19,10 +19,17 @@ def __init__(self):
self.async_client = httpx.AsyncClient

def __call__(self, *args, **kwargs):
"""Make a request using the client."""
return self.make_request(*args, **kwargs)

async def make_request(self, request: Request) -> Response | NoReturn:
- """Make a request and handle exceptions."""
+ """Make a request and handle exceptions.
+
+ :param request: The request object containing the details of the HTTP request.
+ :return: A Response object if the request is successful.
+ :raises RequestException: If a non-retryable HTTP error occurs.
+ :raises RetryableRequestException: If a retryable HTTP error occurs.
+ """
try:
return await self._make_request(request)
except httpx.HTTPStatusError as e:
@@ -46,9 +53,15 @@ async def make_request(self, request: Request) -> Response | NoReturn:
raise RequestException(str(e))

async def _make_request(self, request: Request) -> Response:
- """Make a request using HTTPX."""
+ """Make a request using HTTPX. Private method for internal use.
+
+ :param request: The request object containing the details of the HTTP request.
+ :return: A Response object containing the response data.
+ """
logger.info(f"Requesting {request.url}")
- async with self.async_client(headers=request.headers) as client:
+ async with self.async_client(
+     headers=request.headers, proxy=request.proxy
+ ) as client:
match request.method:
case "GET":
response = await client.get(request.url, params=request.params)
@@ -62,8 +75,8 @@ async def _make_request(self, request: Request) -> Response:
response.raise_for_status()
match request.content_type:
case "text":
- data = response.text
+ data = None
case "json":
data = response.json()
logger.info(f"Returning response for {request.url}")
- return Response(request=request, data=data)
+ return Response(request=request, text=response.text, data=data)
26 changes: 20 additions & 6 deletions dataservice/config.py
@@ -1,7 +1,7 @@
- from typing import NewType, Annotated
+ from typing import Annotated, NewType

from annotated_types import Ge
- from pydantic import BaseModel
+ from pydantic import BaseModel, Field

PositiveInt = Annotated[int, Ge(0)]
Milliseconds = NewType("Milliseconds", PositiveInt)
@@ -19,7 +19,21 @@ class RetryConfig(BaseModel):
class ServiceConfig(BaseModel):
"""Global configuration for the service."""

- deduplication: bool = True
- max_concurrency: PositiveInt = 10
- random_delay: Milliseconds = Milliseconds(0)
- retry: RetryConfig = RetryConfig()
+ retry: RetryConfig = Field(
+     default_factory=RetryConfig, description="The retry configuration."
+ )
+ deduplication: bool = Field(
+     default=True, description="Whether to deduplicate requests."
+ )
+ max_concurrency: PositiveInt = Field(
+     default=10, description="The maximum number of concurrent requests."
+ )
+ random_delay: Milliseconds = Field(
+     default=Milliseconds(0),
+     description="The maximum random delay between requests.",
+ )
+
+ cache: bool = Field(default=False, description="Whether to cache requests.")
+ cache_name: str = Field(
+     default="cache", description="A name to use for the cache. Defaults to 'cache'."
+ )
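The defaults introduced in this diff can be summarized with a plain-Python sketch. The real ``ServiceConfig`` is a Pydantic ``BaseModel``; the dataclasses below (``ServiceConfigSketch``, ``RetryConfigSketch``) are hypothetical stand-ins that only mirror the field names and defaults visible above, and the real ``RetryConfig`` fields are not shown in this diff.

```python
from dataclasses import dataclass, field


@dataclass
class RetryConfigSketch:
    """Placeholder: the real RetryConfig fields are not visible in this diff."""


@dataclass
class ServiceConfigSketch:
    retry: RetryConfigSketch = field(default_factory=RetryConfigSketch)
    deduplication: bool = True   # whether to deduplicate requests
    max_concurrency: int = 10    # maximum number of concurrent requests
    random_delay: int = 0        # maximum random delay between requests, in ms
    cache: bool = False          # whether to cache requests
    cache_name: str = "cache"    # name to use for the cache
```

Using ``default_factory`` for ``retry`` (as the diff does with Pydantic's ``Field``) avoids sharing one mutable ``RetryConfig`` instance across all configs.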