From 1b861b9f1cafceb7108f5a565c537cbad2760a9e Mon Sep 17 00:00:00 2001 From: Jeremy Singer-Vine Date: Sun, 6 Mar 2016 19:07:38 -0500 Subject: [PATCH] Rename collate_chars -> get_text collate_chars still available, to avoid breaking scripts that use it. --- README.md | 2 +- examples/notebooks/extract-table-nics.ipynb | 4 ++-- pdfplumber/page.py | 4 ++-- pdfplumber/utils.py | 3 ++- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 4bac5b25..9edcfae4 100644 --- a/README.md +++ b/README.md @@ -81,7 +81,7 @@ The `pdfplumber.Page` class is at the core of `pdfplumber`. Most things you'll d - By default, the cropped page retains objects that fall at least partly within the bounding box. If an object falls only partly within the box, its dimensions are sliced to fit the bounding box. - Calling `.crop` with `strict=True`, however, retains only objects that fall *entirely* within the bounding box. -- `.collate_chars(x_tolerance=0, y_tolerance=0)`: Collates all of the page's character objects into a single string. Adds spaces where the difference between the `x1` of one character and the `x0` of the next is greater than `x_tolerance`. Adds newline characters where the difference between the `doctop` of one character and the `doctop` of the next is greater than `y_tolerance`. +- `.get_text(x_tolerance=0, y_tolerance=0)`: Collates all of the page's character objects into a single string. Adds spaces where the difference between the `x1` of one character and the `x0` of the next is greater than `x_tolerance`. Adds newline characters where the difference between the `doctop` of one character and the `doctop` of the next is greater than `y_tolerance`. - `.extract_table(...)`: Extracts tabular data from the page. For more details see "[Extracting tables](#extracting-tables)" below. diff --git a/examples/notebooks/extract-table-nics.ipynb b/examples/notebooks/extract-table-nics.ipynb index e8ad31cf..2b06c190 100644 --- a/examples/notebooks/extract-table-nics.ipynb +++ b/examples/notebooks/extract-table-nics.ipynb @@ -294,7 +294,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Use `collate_chars` to extract the report month\n", + "### Use `get_text` to extract the report month\n", "\n", "It looks like the month of the report is listed in an area 35px to 65px from the top of the page. But there's also some other text directly above and below it. So when we crop for that area, we'll use `strict=True` to select only characters (and other objects) that are fully within the crop-box." ] @@ -329,7 +329,7 @@ } ], "source": [ - "month_chars = month_crop.collate_chars(x_tolerance=2, y_tolerance=2)\n", + "month_chars = month_crop.get_text(x_tolerance=2, y_tolerance=2)\n", "month_chars" ] }, diff --git a/pdfplumber/page.py b/pdfplumber/page.py index 4e65d1ad..8ccf2f15 100644 --- a/pdfplumber/page.py +++ b/pdfplumber/page.py @@ -171,8 +171,8 @@ def use_strategy(param, name): return table - def collate_chars(self, x_tolerance=0, y_tolerance=0): - return utils.collate_chars(self.chars, + def get_text(self, x_tolerance=0, y_tolerance=0): + return utils.get_text(self.chars, x_tolerance=x_tolerance, y_tolerance=y_tolerance) diff --git a/pdfplumber/utils.py b/pdfplumber/utils.py index 3fb2ec08..9adbbc0c 100644 --- a/pdfplumber/utils.py +++ b/pdfplumber/utils.py @@ -47,7 +47,7 @@ def collate_line(line_chars, tolerance=0): coll += char["text"] return coll -def collate_chars(chars, x_tolerance=0, y_tolerance=0): +def get_text(chars, x_tolerance=0, y_tolerance=0): if len(chars) == 0: raise Exception("List of chars is empty.") @@ -69,6 +69,7 @@ def collate_chars(chars, x_tolerance=0, y_tolerance=0): coll = "\n".join(lines) return coll +collate_chars = get_text def find_gutters(chars, orientation, min_size=5): """