Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Pc 33771/clean ean inside offer titles books cd vinyles only #15878

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 22 additions & 22 deletions .github/workflows/dev_on_workflow_deploy_pullrequests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -265,26 +265,26 @@ jobs:
argocd_app_wait: true
prune: true

- name: "Run sandbox"
run: |
set -e
VERSION="${{ inputs.app_version }}"
export ENVIRONMENT="${{ inputs.environment }}"
IMAGE="europe-west1-docker.pkg.dev/passculture-infra-prod/pass-culture-artifact-registry/pcapi"
export IMAGE="${IMAGE}:${VERSION}"
export PRID=${{ github.event.pull_request.number }}
#- name: "Run sandbox"
#run: |
#set -e
#VERSION="${{ inputs.app_version }}"
#export ENVIRONMENT="${{ inputs.environment }}"
#IMAGE="europe-west1-docker.pkg.dev/passculture-infra-prod/pass-culture-artifact-registry/pcapi"
#export IMAGE="${IMAGE}:${VERSION}"
#export PRID=${{ github.event.pull_request.number }}

echo "=== exec: flask sandbox -n industrial"
export DATE=$(date +"%Y-%m-%d-%H-%M-%S")
start=$(date +%s.%N)
envsubst < .github/workflows/templates/pullrequest-db-init-05.yaml | kubectl -n pcapi-pr-${{ github.event.pull_request.number }} apply -f -
while true;
do
kubectl logs -n pcapi-pr-${{ github.event.pull_request.number }} -f jobs/pullrequest-db-init-05-${DATE} && break
done
kubectl wait --for=condition=complete --timeout=1800s -n pcapi-pr-${{ github.event.pull_request.number }} jobs/pullrequest-db-init-05-${DATE}
end=$(date +%s.%N)
duration=`date -d@$(bc <<< "$end - $start") -u +%H:%M:%S`
{
echo "| flask sandbox -n industrial | $duration |"
} >> $GITHUB_STEP_SUMMARY
#echo "=== exec: flask sandbox -n industrial"
#export DATE=$(date +"%Y-%m-%d-%H-%M-%S")
#start=$(date +%s.%N)
#envsubst < .github/workflows/templates/pullrequest-db-init-05.yaml | kubectl -n pcapi-pr-${{ github.event.pull_request.number }} apply -f -
#while true;
#do
#kubectl logs -n pcapi-pr-${{ github.event.pull_request.number }} -f jobs/pullrequest-db-init-05-${DATE} && break
#done
#kubectl wait --for=condition=complete --timeout=1800s -n pcapi-pr-${{ github.event.pull_request.number }} jobs/pullrequest-db-init-05-${DATE}
#end=$(date +%s.%N)
#duration=`date -d@$(bc <<< "$end - $start") -u +%H:%M:%S`
#{
#echo "| flask sandbox -n industrial | $duration |"
#} >> $GITHUB_STEP_SUMMARY
204 changes: 204 additions & 0 deletions api/src/pcapi/scripts/clean_offer_titles_with_eans/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,204 @@
from dataclasses import dataclass
from datetime import datetime
from datetime import timezone as tz
import functools
import logging
from typing import Collection

import sqlalchemy as sa

from pcapi.core.bookings import api as bookings_api
from pcapi.core.mails import transactional as transactional_mails
from pcapi.core.offers.models import GcuCompatibilityType
from pcapi.core.offers.models import Offer
from pcapi.core.offers.models import OfferValidationStatus
from pcapi.flask_app import app
from pcapi.models import db
from pcapi.models.offer_mixin import OfferValidationType
from pcapi.repository import atomic
from pcapi.repository import on_commit
from pcapi.utils.chunks import get_chunks


# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Mandatory since this module uses atomic() which needs an application context.
# The context is pushed at import time, i.e. before the @atomic()-decorated
# functions further down in this module are defined.
app.app_context().push()


# Raw SQL returning one batch of offers to clean.
#
# Inner query: up to 1000 offers that are not REJECTED, belong to the paper
# book / CD / vinyl subcategories and whose name contains 13 consecutive
# digits. The 13-digit sequence is extracted from the name as `ean`.
#
# Outer query: LEFT JOIN on product through the `ean` key of its JSON data, so
# offers without a matching product come back with `exists` false and NULL
# product columns. The column order must match the fields of OfferEanQueryRow.
BOOKS_CDS_VINYLES_QUERY = """
SELECT
offer_sub_query.id,
offer_sub_query.ean,
offer_sub_query.name,
offer_sub_query."subcategoryId",
offer_sub_query."isActive",
product.id is not null as exists,
product.id as product_id,
product.name as product_name,
product."jsonData" as product_json_data,
product."gcuCompatibilityType"
FROM (
SELECT
id,
substring("name" similar '%#"[[:digit:]]{13}#"%' escape '#') as ean,
name,
"subcategoryId",
"isActive"
FROM
offer
WHERE
"name" similar to '%\\d{13}%'
and "validation" != 'REJECTED'
and "subcategoryId" in (
'LIVRE_PAPIER',
'SUPPORT_PHYSIQUE_MUSIQUE_CD',
'SUPPORT_PHYSIQUE_MUSIQUE_VINYLE'
)
LIMIT
1000
) offer_sub_query
LEFT JOIN
product on product."jsonData"->>'ean' = offer_sub_query.ean
LIMIT
1000
"""


@dataclass(frozen=True)
class OfferEanQueryRow:
id: int
ean: str
name: str
subcategory: str
is_active: bool
exists: bool
product_id: int | None
product_name: str | None
product_json_data: dict | None
gcu_compatibility: str | None


def get_offers_with_ean_inside_title() -> Collection[OfferEanQueryRow]:
    """Run BOOKS_CDS_VINYLES_QUERY and return its rows as OfferEanQueryRow objects.

    Columns are mapped by position, so the indexes below follow the order of
    the SELECT clause of the query.
    """
    statement = sa.text(BOOKS_CDS_VINYLES_QUERY)
    return [
        OfferEanQueryRow(
            id=record[0],
            ean=record[1],
            name=record[2],
            subcategory=record[3],
            is_active=record[4],
            exists=record[5],
            product_id=record[6],
            product_name=record[7],
            product_json_data=record[8],
            gcu_compatibility=record[9],
        )
        for record in db.session.execute(statement)
    ]


def run() -> None:
    """Clean offers whose title contains an EAN, one batch after another.

    Each iteration fetches a batch with get_offers_with_ean_inside_title()
    and hands it to parse_offers(). The loop stops when the query returns
    nothing, or when a batch only contains offers that were already handled.

    Why the second stop condition: rejected offers drop out of the query, but
    a renamed offer stays in it when the product name itself contains 13
    consecutive digits. Such offers would otherwise be fetched and rewritten
    forever.

    NOTE(review): the query uses LIMIT without ORDER BY, so a batch made only
    of already-handled offers may hide offers that were never fetched. In
    that case the script stops early and has to be run again once the
    remaining titles are dealt with.
    """
    count = 0
    # Ids of every offer already handed to parse_offers().
    handled_ids: set[int] = set()

    while True:
        print(f"start loop #{count}...")
        rows = get_offers_with_ean_inside_title()
        if not rows:
            break

        # Processing is not worth repeating: an offer that still matches
        # after being handled once will match again after a second pass.
        new_rows = [row for row in rows if row.id not in handled_ids]
        if not new_rows:
            print(f"stop: {len(rows)} row(s) fetched, all already processed and still matching.")
            break

        handled_ids.update(row.id for row in new_rows)
        parse_offers(new_rows)
        count += 1


def parse_offers(rows: Collection[OfferEanQueryRow]) -> None:
    """Dispatch rows, 100 at a time, to the rejection or update routines.

    Within each chunk a row falls into exactly one group:
    - no product matches its EAN: the offer is rejected;
    - the product is not GCU-compatible: the offer is rejected;
    - otherwise: the offer is updated from its product.
    """
    compatible = GcuCompatibilityType.COMPATIBLE.value

    for idx, chunk in enumerate(get_chunks(rows, chunk_size=100)):
        print(f"[parse offers][{idx}]...")

        without_product: list[OfferEanQueryRow] = []
        not_compatible: list[OfferEanQueryRow] = []
        to_update: list[OfferEanQueryRow] = []

        for row in chunk:
            if not row.exists:
                bucket = without_product
            elif row.gcu_compatibility != compatible:
                bucket = not_compatible
            else:
                bucket = to_update
            bucket.append(row)

        reject_offers(without_product)
        reject_offers(not_compatible)
        update_legit_offers(to_update)

        print(f"[parse offers][{idx}]...done: {len(chunk)} offers.")


@atomic()
def update_legit_offers(offer_rows: Collection[OfferEanQueryRow]) -> None:
    """Replace the name (and extra data) of offers with those of their product.

    For every offer referenced by `offer_rows`, the name is set to the
    product name. `extraData` is replaced by the product JSON data only when
    that data is truthy.

    Does nothing when `offer_rows` is empty.
    """
    if not offer_rows:
        # parse_offers() passes an empty list whenever a chunk has no legit
        # offer: avoid querying with an empty IN clause.
        return

    row_by_offer_id = {row.id: row for row in offer_rows}
    ids = set(row_by_offer_id)

    for offer in Offer.query.filter(Offer.id.in_(ids)):
        row = row_by_offer_id[offer.id]
        offer.name = row.product_name

        if row.product_json_data:
            offer.extraData = row.product_json_data


@atomic()
def reject_offers(offer_rows: Collection[OfferEanQueryRow]) -> None:
    """Reject the offers referenced by `offer_rows`.

    For each matching offer that is not already rejected:
    - its bookings are cancelled and a cancellation e-mail is sent for each
      cancelled booking;
    - a validation status update e-mail to the offerer is registered with
      on_commit().

    The offers are then updated in bulk: validation set to REJECTED,
    validation date/type/author refreshed and `isActive` set to False.

    Does nothing when `offer_rows` is empty.
    """

    def cancel_booking(offer: Offer) -> None:
        # NOTE(review): these e-mails are sent right away, whereas the offerer
        # notification below is deferred through on_commit() -- confirm that
        # the difference is intended.
        cancelled_bookings = bookings_api.cancel_bookings_from_rejected_offer(offer)
        for booking in cancelled_bookings:
            transactional_mails.send_booking_cancellation_by_pro_to_beneficiary_email(
                booking, rejected_by_fraud_action=True
            )

    def notify_offerer(offer: Offer) -> None:
        # Use the venue booking e-mail when set, otherwise fall back to the
        # e-mail of every user attached to the managing offerer.
        if offer.venue.bookingEmail:
            recipients = [offer.venue.bookingEmail]
        else:
            recipients = [recipient.user.email for recipient in offer.venue.managingOfferer.UserOfferers]

        # Built before the bulk update below, so `offer.validation` is still
        # the previous validation status here.
        offer_data = transactional_mails.get_email_data_from_offer(
            offer, offer.validation, OfferValidationStatus.REJECTED
        )
        on_commit(
            functools.partial(
                transactional_mails.send_offer_validation_status_update_email,
                offer_data,
                recipients,
            )
        )

    if not offer_rows:
        # parse_offers() passes an empty list whenever a chunk has nothing to
        # reject: skip the SELECT and the UPDATE, both with an empty IN clause.
        return

    ids = {row.id for row in offer_rows}
    # NOTE(review): the filter compares Offer.status with an
    # OfferValidationStatus value while the update below writes `validation`;
    # confirm that Offer.status is the intended column here.
    base_query = Offer.query.filter(
        Offer.id.in_(ids),
        Offer.status != OfferValidationStatus.REJECTED.value,
    )

    for offer in base_query:
        cancel_booking(offer)
        notify_offerer(offer)

    base_query.update(
        {
            "validation": OfferValidationStatus.REJECTED.value,
            "lastValidationDate": datetime.now(tz.utc),  # pylint: disable=datetime-now
            "lastValidationType": OfferValidationType.AUTO.value,
            "lastValidationAuthorUserId": None,
            "isActive": False,
        },
        synchronize_session=False,
    )


if __name__ == "__main__":
    # Script entry point: push an application context (one is also pushed at
    # module import above), then process every batch.
    app.app_context().push()
    run()
5 changes: 5 additions & 0 deletions api/src/pcapi/utils/chunks.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,11 @@ def get_chunks(input_data: typing.Iterable[T], chunk_size: int) -> typing.Genera
Build chunks of `chunk_size` max from an iterable.
eg. get_chunks([1, 2, 3], 2) -> ([1, 2], [3])
"""
# Avoid infinite loop: len([]) will be < to 1 (chunk_size)
if chunk_size < 1:
chunk_size = 1
input_data = (_ for _ in [])

if not inspect.isgenerator(input_data):
# if `input_data` is not a generator, the while loop will not
# consume anything and always get the same first items from
Expand Down
Loading
Loading