Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix ignore option #135

Merged
merged 2 commits into from
May 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 13 additions & 5 deletions .github/workflows/endtoend.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ jobs:
python-version: ["3.9.13", "3.12.3"]
os: [ubuntu-latest, macos-latest, windows-latest]
lang: ["es", "en"]
ignore: [false, true]
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v4
Expand All @@ -22,22 +23,29 @@ jobs:
uses: abatilo/[email protected]
with:
poetry-version: "1.7.1"
- name: Install fa-scrapper
- name: Install fa-scraper
run: poetry install
- name: Install csv-diff
# Install csv-diff with https://github.com/simonw/csv-diff/pull/19
run: python3 -m pip install git+https://github.com/mikecoop83/csv-diff@c3d32f758343a2ba3737d612e6e906fd9d77322b
- name: Run fa-scrapper
- name: Run fa-scraper
env:
TEST_ACCOUNT_ID: ${{ secrets.TEST_ACCOUNT_ID }}
FA_LANG: ${{ matrix.lang }}
run: poetry run fa-scrapper ${TEST_ACCOUNT_ID} --lang ${FA_LANG} --csv output.csv
run: |
# matrix.ignore is substituted as the literal word "true" or "false" before
# bash runs. `[ -n false ]` succeeds for any non-empty word, so compare
# against "true" explicitly; otherwise every matrix job would ignore TV titles.
EXTRA_FLAGS=""
if [ "${{ matrix.ignore }}" = "true" ]; then
EXTRA_FLAGS="--ignore TV --ignore TVMS --ignore TVS"
fi
# EXTRA_FLAGS is intentionally unquoted so it word-splits into separate arguments.
poetry run fa-scraper ${TEST_ACCOUNT_ID} --lang ${FA_LANG} --csv output.csv ${EXTRA_FLAGS}
shell: bash
- name: Check output
env:
FA_LANG: ${{ matrix.lang }}
run: |
OUTPUT=$(csv-diff --encoding "utf-8" output.csv testdata/expected-${FA_LANG}.csv)
# Same explicit comparison as in the run step: only jobs with ignore=true
# are checked against the "-ignore" expected file.
SUFFIX=""
if [ "${{ matrix.ignore }}" = "true" ]; then
SUFFIX="-ignore"
fi
OUTPUT=$(csv-diff --encoding "utf-8" output.csv testdata/expected-${FA_LANG}${SUFFIX}.csv)
if [ -n "$OUTPUT" ]; then
echo "$OUTPUT"
exit 1
Expand All @@ -47,5 +55,5 @@ jobs:
if: failure()
uses: actions/upload-artifact@v4
with:
name: output-${{ matrix.lang }}-${{ matrix.os }}-${{ matrix.python-version }}
name: output-${{ matrix.lang }}-${{ matrix.os }}-${{ matrix.python-version }}-${{ matrix.ignore }}
path: output.csv
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@

## Unreleased

- (Enhancement) Print page and title that caused an exception (#xx)
- (Breaking) Remove option to ignore 'S' (#135)
- (Breaking) Bump minimum supported Python version to 3.9 (#126)
- (Enhancement) Print page and title that caused an exception (#133)

## 0.3.2 (01-05-2022)

Expand Down
2 changes: 1 addition & 1 deletion fa_scraper/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def main():
)
parser.add_argument(
"--ignore",
help="ignore category (default: all)",
help="ignore category (default: none)",
type=FACategory,
choices=FACategory,
action="append",
Expand Down
106 changes: 66 additions & 40 deletions fa_scraper/fa_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,22 @@
"Directors",
)

# Per-language lookup of the title-type label associated with each ignorable
# category. should_skip_type compares a title's type text against these values
# with exact equality, so spelling and letter case are significant.
# NOTE(review): the labels presumably mirror the text the site renders in each
# title's "type" element for that language -- confirm against the live pages.
SKIP_BY_LANG = {
Lang.ES: {
FACategory.TVS: "Serie",
FACategory.TVMS: "Miniserie",
FACategory.TV: "TV",
},
Lang.EN: {
FACategory.TVS: "tv series",
FACategory.TVMS: "miniseries",
FACategory.TV: "TV",
},
}

# Message templates filled in with str.format before being printed.
# Printed with the offending title just before a parsing exception is re-raised.
TITLE_ERROR_TEMPLATE = "Unexpected error while parsing data for title '{title}'"
# Printed with the page URL just before a page-level exception is re-raised.
PAGE_ERROR_TEMPLATE = "Unexpected error while parsing data on page '{page}'"
# Printed when a title is dropped because its type matches an ignored category.
SKIP_TITLE_TEMPLATE = "Skipping {title} since it is a '{title_type}'"


def get_date(tag: bs4.element.Tag, lang: Lang) -> str:
Expand Down Expand Up @@ -82,31 +96,14 @@ def sanitize_director_tag(d: bs4.element.Tag) -> str:
)


def is_chosen_category(
tag: bs4.element.Tag, lang: Lang, ignore_list: Iterable[FACategory]
def should_skip_type(
title_type: str, lang: Lang, ignore_list: Iterable[FACategory]
) -> bool:
"""Checks if given tag is within the chosen categories"""

title = tag.find_all(class_="mc-title")[0].a.string.strip()

if lang == Lang.ES:
skipdct = {
FACategory.TVS: "(Serie de TV)",
FACategory.TVMS: "(Miniserie de TV)",
FACategory.TV: "(TV)",
FACategory.S: "(C)",
}
else:
skipdct = {
FACategory.TVS: "(TV Series)",
FACategory.TVMS: "(TV Miniseries)",
FACategory.TV: "(TV)",
FACategory.S: "(S)",
}

skip = map(skipdct.get, ignore_list)

return not any(title.endswith(suffix) for suffix in skip)
"""Checks if given title type should be skipped."""
for category in ignore_list:
if title_type == SKIP_BY_LANG[lang][category]:
return True
return False


def pages_from(template: str) -> Iterator[Page]:
Expand Down Expand Up @@ -149,9 +146,24 @@ def get_profile_data(
for tag in tags:
if tag["class"] == ["user-ratings-header"]:
cur_date = get_date(tag, lang)
elif is_chosen_category(tag, lang, ignore_list):
else:
try:
title = tag.find_all(class_="mc-title")[0].a
title_name = title.string.strip()
title_type = tag.find_all(class_="d-flex")[0].find_all(
class_="type"
)
if title_type and should_skip_type(
title_type[0].string.strip(), lang, ignore_list
):
print(
SKIP_TITLE_TEMPLATE.format(
title=title_name,
title_type=title_type[0].string.strip(),
)
)
continue

yield {
"Title": title.string.strip(),
"Year": int(
Expand Down Expand Up @@ -187,21 +199,35 @@ def get_list_data(
tags = page.contents.find_all(class_=["movie-wrapper"])

for tag in tags:
if is_chosen_category(tag, lang, ignore_list):
try:
title = tag.find_all(class_="mc-title")[0].a
yield {
"Title": title.string.strip(),
"Year": int(
tag.find_all(class_="d-flex")[0]
.find_all(class_="mc-year")[0]
.string.strip()
),
"Directors": get_directors(tag),
}
except:
print(TITLE_ERROR_TEMPLATE.format(title=title.string.strip()))
raise
try:
title = tag.find_all(class_="mc-title")[0].a
title_name = title.string.strip()
title_type = tag.find_all(class_="d-flex")[0].find_all(
class_="type"
)
if title_type and should_skip_type(
title_type[0].string.strip(), lang, ignore_list
):
print(
SKIP_TITLE_TEMPLATE.format(
title=title_name,
title_type=title_type[0].string.strip(),
)
)
continue

yield {
"Title": title_name,
"Year": int(
tag.find_all(class_="d-flex")[0]
.find_all(class_="mc-year")[0]
.string.strip()
),
"Directors": get_directors(tag),
}
except:
print(TITLE_ERROR_TEMPLATE.format(title=title_name))
raise
except:
print(PAGE_ERROR_TEMPLATE.format(page=page.url))
raise
Expand Down
1 change: 0 additions & 1 deletion fa_scraper/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ class FACategory(Enum):
TVS = "TVS"
TVMS = "TVMS"
TV = "TV"
S = "S"

def __str__(self):
"""Returns category"""
Expand Down
5 changes: 5 additions & 0 deletions testdata/expected-en-ignore.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Title,Year,Directors,WatchedDate,Rating,Rating10
Attack on Titan: Chronicle,2020,"Masashi Koizuka, Tetsurō Araki",2022-01-03,4.0,8
Full Metal Jacket,1987,Stanley Kubrick,2021-11-10,2.0,4
The Lion King,1994,"Rob Minkoff, Roger Allers",2021-11-10,3.5,7
Mad Max: Fury Road,2015,George Miller,2021-11-10,5.0,10
2 changes: 2 additions & 0 deletions testdata/expected-en.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
Title,Year,Directors,WatchedDate,Rating,Rating10
Unbelievable,2019,"Susannah Grant, Michael Chabon",2024-05-29,3.0,6
Down Among the Big Boys,1993,Charles Gormley,2024-05-29,0.5,1
Fleabag,2016,"Phoebe Waller-Bridge, Harry Bradbeer, Tim Kirkby",2024-05-29,5.0,10
Attack on Titan: Chronicle,2020,"Masashi Koizuka, Tetsurō Araki",2022-01-03,4.0,8
Full Metal Jacket,1987,Stanley Kubrick,2021-11-10,2.0,4
Expand Down
5 changes: 5 additions & 0 deletions testdata/expected-es-ignore.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Title,Year,Directors,WatchedDate,Rating,Rating10
Shingeki no Kyojin: Chronicle,2020,"Masashi Koizuka, Tetsurō Araki",2022-01-04,4.0,8
La chaqueta metálica,1987,Stanley Kubrick,2021-11-11,2.0,4
El rey león,1994,"Rob Minkoff, Roger Allers",2021-11-11,3.5,7
Mad Max: Furia en la carretera,2015,George Miller,2021-11-11,5.0,10
2 changes: 2 additions & 0 deletions testdata/expected-es.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
Title,Year,Directors,WatchedDate,Rating,Rating10
Creedme,2019,"Susannah Grant, Michael Chabon",2024-05-30,3.0,6
Down Among the Big Boys,1993,Charles Gormley,2024-05-30,0.5,1
Fleabag,2016,"Phoebe Waller-Bridge, Harry Bradbeer, Tim Kirkby",2024-05-30,5.0,10
Shingeki no Kyojin: Chronicle,2020,"Masashi Koizuka, Tetsurō Araki",2022-01-04,4.0,8
La chaqueta metálica,1987,Stanley Kubrick,2021-11-11,2.0,4
Expand Down
Loading