Skip to content

Commit

Permalink
Update benchmarks
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Jan 7, 2025
1 parent 8c3eb5f commit dd667d2
Show file tree
Hide file tree
Showing 15 changed files with 39 additions and 40 deletions.
10 changes: 5 additions & 5 deletions .github/workflows/benchmarks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,20 +21,20 @@ jobs:
- name: Run detection benchmark test
run: |
poetry run python benchmark/detection.py --max 2
poetry run python scripts/verify_benchmark_scores.py results/benchmark/det_bench/results.json --bench_type detection
poetry run python benchmark/utils/verify_benchmark_scores.py results/benchmark/det_bench/results.json --bench_type detection
- name: Run recognition benchmark test
run: |
poetry run python benchmark/recognition.py --max 2
poetry run python scripts/verify_benchmark_scores.py results/benchmark/rec_bench/results.json --bench_type recognition
poetry run python benchmark/utils/verify_benchmark_scores.py results/benchmark/rec_bench/results.json --bench_type recognition
- name: Run layout benchmark test
run: |
poetry run python benchmark/layout.py --max 5
poetry run python scripts/verify_benchmark_scores.py results/benchmark/layout_bench/results.json --bench_type layout
poetry run python benchmark/utils/verify_benchmark_scores.py results/benchmark/layout_bench/results.json --bench_type layout
- name: Run ordering benchmark
run: |
poetry run python benchmark/ordering.py --max 5
poetry run python scripts/verify_benchmark_scores.py results/benchmark/order_bench/results.json --bench_type ordering
poetry run python benchmark/utils/verify_benchmark_scores.py results/benchmark/order_bench/results.json --bench_type ordering
- name: Run table recognition benchmark
run: |
poetry run python benchmark/table_recognition.py --max 5
poetry run python scripts/verify_benchmark_scores.py results/benchmark/table_rec_bench/results.json --bench_type table_recognition
poetry run python benchmark/utils/verify_benchmark_scores.py results/benchmark/table_rec_bench/results.json --bench_type table_recognition
6 changes: 3 additions & 3 deletions benchmark/detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@
import copy
import json

from surya.benchmark.bbox import get_pdf_lines
from surya.benchmark.metrics import precision_recall
from surya.benchmark.tesseract import tesseract_parallel
from benchmark.utils.bbox import get_pdf_lines
from benchmark.utils.metrics import precision_recall
from benchmark.utils.tesseract import tesseract_parallel
from surya.input.processing import open_pdf, get_page_images, convert_if_not_rgb
from surya.postprocessing.heatmap import draw_polys_on_image
from surya.postprocessing.util import rescale_bbox
Expand Down
2 changes: 1 addition & 1 deletion benchmark/layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import copy
import json

from surya.benchmark.metrics import precision_recall
from benchmark.utils.metrics import precision_recall
from surya.layout import LayoutPredictor
from surya.input.processing import convert_if_not_rgb
from surya.postprocessing.heatmap import draw_bboxes_on_image
Expand Down
2 changes: 1 addition & 1 deletion benchmark/ordering.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from surya.layout import LayoutPredictor
from surya.schema import Bbox
from surya.settings import settings
from surya.benchmark.metrics import rank_accuracy
from benchmark.utils.metrics import rank_accuracy
import os
import time
import datasets
Expand Down
4 changes: 2 additions & 2 deletions benchmark/recognition.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
import argparse
from collections import defaultdict

from benchmark.scoring import overlap_score
from benchmark.utils.scoring import overlap_score
from surya.input.processing import convert_if_not_rgb
from surya.postprocessing.text import draw_text_on_image
from surya.recognition import RecognitionPredictor
from surya.settings import settings
from surya.recognition.languages import CODE_TO_LANGUAGE
from surya.benchmark.tesseract import tesseract_ocr_parallel, surya_lang_to_tesseract, TESS_CODE_TO_LANGUAGE
from benchmark.utils.tesseract import tesseract_ocr_parallel, surya_lang_to_tesseract, TESS_CODE_TO_LANGUAGE
import os
import datasets
import json
Expand Down
4 changes: 2 additions & 2 deletions benchmark/table_recognition.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@
from surya.input.processing import convert_if_not_rgb
from surya.table_rec import TableRecPredictor
from surya.settings import settings
from surya.benchmark.metrics import penalized_iou_score
from surya.benchmark.tatr import load_tatr, batch_inference_tatr
from benchmark.utils.metrics import penalized_iou_score
from benchmark.utils.tatr import load_tatr, batch_inference_tatr
import os
import time
import datasets
Expand Down
Empty file added benchmark/utils/__init__.py
Empty file.
23 changes: 23 additions & 0 deletions surya/benchmark/util.py → benchmark/utils/bbox.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,26 @@
import fitz as pymupdf
from surya.postprocessing.util import rescale_bbox


def get_pdf_lines(pdf_path, img_sizes):
    """Collect the text-line bounding boxes of a PDF, one list per page.

    Args:
        pdf_path: Path of the PDF; passed unchanged to ``pymupdf.open``.
        img_sizes: One entry per page to read. Entry ``i`` is paired with
            page ``i`` and handed to ``rescale_bbox`` as the target size
            (presumably the size of the rendered page image -- confirm
            against the caller).

    Returns:
        A list with one element per entry of ``img_sizes``. Each element is
        the list of line boxes found on that page, after being passed through
        ``rescale_bbox`` together with the page size and the ``img_sizes``
        entry.

    Raises:
        Whatever ``pymupdf.open`` or page indexing raises, e.g. when
        ``img_sizes`` has more entries than the document has pages.
    """
    # Start from the default "dict" extraction flags and clear the bits that
    # preserve ligatures and images. The mask is loop-invariant, so it is
    # built once instead of once per page.
    text_flags = (
        pymupdf.TEXTFLAGS_DICT
        & ~pymupdf.TEXT_PRESERVE_LIGATURES
        & ~pymupdf.TEXT_PRESERVE_IMAGES
    )

    page_lines = []
    # The context manager closes the document on exit, including when a page
    # lookup fails; previously the handle was never released.
    with pymupdf.open(pdf_path) as doc:
        for idx, img_size in enumerate(img_sizes):
            page = doc[idx]
            blocks = page.get_text("dict", sort=True, flags=text_flags)["blocks"]

            # Flatten block -> line and copy each bbox into a plain list.
            line_boxes = [
                list(line["bbox"])
                for block in blocks
                for line in block["lines"]
            ]

            # Page extent derived from its bounding rectangle.
            page_box = page.bound()
            page_size = (page_box[2] - page_box[0], page_box[3] - page_box[1])

            page_lines.append(
                [rescale_bbox(bbox, page_size, img_size) for bbox in line_boxes]
            )

    return page_lines

def merge_boxes(box1, box2):
    """Combine two 4-element boxes into one tuple that spans both.

    Indices 0 and 1 of the result are the element-wise minimum of the inputs;
    indices 2 and 3 are the element-wise maximum. Presumably the layout is
    (x0, y0, x1, y1) -- confirm against callers.
    """
    low_first = min(box1[0], box2[0])
    low_second = min(box1[1], box2[1])
    high_first = max(box1[2], box2[2])
    high_second = max(box1[3], box2[3])
    return (low_first, low_second, high_first, high_second)

Expand Down
File renamed without changes.
File renamed without changes.
4 changes: 1 addition & 3 deletions surya/benchmark/tatr.py → benchmark/utils/tatr.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
import torch
from transformers import DetrFeatureExtractor, AutoModelForObjectDetection
from transformers import AutoModelForObjectDetection
from surya.settings import settings

from PIL import Image
import numpy as np


Expand Down
File renamed without changes.
File renamed without changes.
22 changes: 0 additions & 22 deletions surya/benchmark/bbox.py

This file was deleted.

2 changes: 1 addition & 1 deletion surya/input/pdflines.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from pdftext.extraction import dictionary_output

from surya.postprocessing.text import sort_text_lines
from surya.recognition.util import sort_text_lines
from surya.schema import PolygonBox
import numpy as np

Expand Down

0 comments on commit dd667d2

Please sign in to comment.