Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix the bug where Python scripts fail to execute PDF text recognition… #11994

Merged
merged 3 commits into from
Apr 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 17 additions & 7 deletions paddleocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -559,8 +559,9 @@ def check_img(img, alpha_color=(255, 255, 255)):
file format: jpg, png and other image formats that opencv can decode, as well as gif and pdf formats
storage type: binary image, net image file, local image file
alpha_color: Background color in images in RGBA format
return: numpy.array (h, w, 3)
return: numpy.array (h, w, 3) or list (p, h, w, 3) (p: page of pdf), boolean, boolean
"""
flag_gif, flag_pdf = False, False
if isinstance(img, bytes):
img = img_decode(img)
if isinstance(img, str):
Expand Down Expand Up @@ -589,17 +590,17 @@ def check_img(img, alpha_color=(255, 255, 255)):
img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
except:
logger.error("error in loading image:{}".format(image_file))
return None
return None, flag_gif, flag_pdf
if img is None:
logger.error("error in loading image:{}".format(image_file))
return None
return None, flag_gif, flag_pdf
# single channel image array.shape:h,w
if isinstance(img, np.ndarray) and len(img.shape) == 2:
img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
# four channel image array.shape:h,w,c
if isinstance(img, np.ndarray) and len(img.shape) == 3 and img.shape[2] == 4:
img = alpha_to_color(img, alpha_color)
return img
return img, flag_gif, flag_pdf


class PaddleOCR(predict_system.TextSystem):
Expand Down Expand Up @@ -700,9 +701,9 @@ def ocr(
"Since the angle classifier is not initialized, it will not be used during the forward process"
)

img = check_img(img, alpha_color)
img, flag_gif, flag_pdf = check_img(img, alpha_color)
# for infer pdf file
if isinstance(img, list):
if isinstance(img, list) and flag_pdf:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

这样改,对处理gif会不会有影响

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

是否有必要返回 flag_gif 和 flag_pdf?这里只判断 img 是不是 list,应该也可以达到目标。

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. 我测试过了,不会对 gif 造成影响。从代码可知,单独判断 gif 文件是因为它的读取方式与普通图片不同,后续处理应该是一样的,不会像 pdf 那样因为存在多页而出错。
  2. 是的,目前看来只判断是不是 list 也可以达到目标,但仅根据数据类型判断不太稳妥。既然代码中已经有 flag_pdf 这个判断标准,加上这个条件更符合相关函数的定义和代码逻辑,也不会影响后续的设计。

if self.page_num > len(img) or self.page_num == 0:
imgs = img
else:
Expand Down Expand Up @@ -837,7 +838,16 @@ def __call__(
img_idx=0,
alpha_color=(255, 255, 255),
):
img = check_img(img, alpha_color)
img, flag_gif, flag_pdf = check_img(img, alpha_color)
if isinstance(img, list) and flag_pdf:
res_list = []
for index, pdf_img in enumerate(img):
logger.info("processing {}/{} page:".format(index + 1, len(img)))
res, _ = super().__call__(
pdf_img, return_ocr_result_in_table, img_idx=index
)
res_list.append(res)
return res_list
res, _ = super().__call__(img, return_ocr_result_in_table, img_idx=img_idx)
return res
Comment on lines +850 to 852
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

这里返回类型发生了改变,会不会对用户使用造成困扰?建议参考 ocr 部分处理一下。

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

关于“返回类型发生了改变,会不会对用户使用造成困扰”:参考 ocr 部分可知,它对返回值的处理是:img 如果是 list 则不做改变;如果不是 list,则把它放入一个 list 里返回,即统一处理成 list。然而 PPStructure 类的定义和 ocr 不同,似乎被设计为返回单个页面的结果,main 函数也验证了我的猜想:目前命令行方式调用 PPStructure 时,是让它返回单个值的。如果按照 ocr 部分的方式处理,势必要改动 main 函数,我觉得暂时不动比较好。您那边可能对后续如何编写有其它设计,所以我尽量不改变已有的操作方式。


Expand Down
56 changes: 56 additions & 0 deletions ppstructure/docs/quickstart.md
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,62 @@ for line in result:
print(line)
```

```python
import os
import cv2
from paddleocr import PPStructure,save_structure_res

# Structure engine with OCR enabled and the table option turned off.
ocr_engine = PPStructure(table=False, ocr=True, show_log=True)

save_folder = './output'
img_path = 'ppstructure/recovery/UnrealText.pdf'
# Name passed to save_structure_res: the file name up to its first dot.
# Computed once because it does not change between pages.
img_name = os.path.basename(img_path).split('.')[0]

# When given a PDF path, the engine returns a list with one result per page.
result = ocr_engine(img_path)
for index, res in enumerate(result):
    save_structure_res(res, save_folder, img_name, index)

for res in result:
    for line in res:
        # Remove the 'img' entry before printing (presumably the region
        # image, which would flood the output); the default keeps this from
        # raising KeyError if an entry has no such key.
        line.pop('img', None)
        print(line)
```

```python
import os
import cv2
import numpy as np
from paddleocr import PPStructure,save_structure_res
from paddle.utils import try_import
from PIL import Image

# Structure engine with OCR enabled and the table option turned off.
ocr_engine = PPStructure(table=False, ocr=True, show_log=True)

save_folder = './output'
img_path = 'ppstructure/recovery/UnrealText.pdf'
# Name passed to save_structure_res: the file name up to its first dot.
# Computed once because it does not change between pages.
img_name = os.path.basename(img_path).split('.')[0]

# Render every PDF page to a BGR numpy image with PyMuPDF ("fitz").
fitz = try_import("fitz")
zoom_2x = fitz.Matrix(2, 2)
no_zoom = fitz.Matrix(1, 1)
imgs = []
with fitz.open(img_path) as pdf:
    for page in pdf:
        pm = page.get_pixmap(matrix=zoom_2x, alpha=False)

        # if width or height > 2000 pixels, don't enlarge the image
        if pm.width > 2000 or pm.height > 2000:
            pm = page.get_pixmap(matrix=no_zoom, alpha=False)

        img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples)
        # Convert from RGB to the BGR channel order.
        img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
        imgs.append(img)

# Run the engine page by page; each call handles a single image.
for index, img in enumerate(imgs):
    result = ocr_engine(img)
    save_structure_res(result, save_folder, img_name, index)
    for line in result:
        # Remove the 'img' entry before printing (presumably the region
        # image, which would flood the output); the default keeps this from
        # raising KeyError if an entry has no such key.
        line.pop('img', None)
        print(line)
```

<a name="224"></a>

#### 2.2.4 表格识别
Expand Down
56 changes: 56 additions & 0 deletions ppstructure/docs/quickstart_en.md
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,62 @@ for line in result:
print(line)
```

```python
import os
import cv2
from paddleocr import PPStructure,save_structure_res

# Structure engine with OCR enabled and the table option turned off.
ocr_engine = PPStructure(table=False, ocr=True, show_log=True)

save_folder = './output'
img_path = 'ppstructure/recovery/UnrealText.pdf'
# Name passed to save_structure_res: the file name up to its first dot.
# Computed once because it does not change between pages.
img_name = os.path.basename(img_path).split('.')[0]

# When given a PDF path, the engine returns a list with one result per page.
result = ocr_engine(img_path)
for index, res in enumerate(result):
    save_structure_res(res, save_folder, img_name, index)

for res in result:
    for line in res:
        # Remove the 'img' entry before printing (presumably the region
        # image, which would flood the output); the default keeps this from
        # raising KeyError if an entry has no such key.
        line.pop('img', None)
        print(line)
```

```python
import os
import cv2
import numpy as np
from paddleocr import PPStructure,save_structure_res
from paddle.utils import try_import
from PIL import Image

# Structure engine with OCR enabled and the table option turned off.
ocr_engine = PPStructure(table=False, ocr=True, show_log=True)

save_folder = './output'
img_path = 'ppstructure/recovery/UnrealText.pdf'
# Name passed to save_structure_res: the file name up to its first dot.
# Computed once because it does not change between pages.
img_name = os.path.basename(img_path).split('.')[0]

# Render every PDF page to a BGR numpy image with PyMuPDF ("fitz").
fitz = try_import("fitz")
zoom_2x = fitz.Matrix(2, 2)
no_zoom = fitz.Matrix(1, 1)
imgs = []
with fitz.open(img_path) as pdf:
    for page in pdf:
        pm = page.get_pixmap(matrix=zoom_2x, alpha=False)

        # if width or height > 2000 pixels, don't enlarge the image
        if pm.width > 2000 or pm.height > 2000:
            pm = page.get_pixmap(matrix=no_zoom, alpha=False)

        img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples)
        # Convert from RGB to the BGR channel order.
        img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
        imgs.append(img)

# Run the engine page by page; each call handles a single image.
for index, img in enumerate(imgs):
    result = ocr_engine(img)
    save_structure_res(result, save_folder, img_name, index)
    for line in result:
        # Remove the 'img' entry before printing (presumably the region
        # image, which would flood the output); the default keeps this from
        # raising KeyError if an entry has no such key.
        line.pop('img', None)
        print(line)
```

<a name="224"></a>
#### 2.2.4 table recognition

Expand Down
Loading