[Draft] Framework for exporting data #634

Draft: wants to merge 12 commits into main
62 changes: 62 additions & 0 deletions ami/exports/all_captures.py
@@ -0,0 +1,62 @@
import logging

from django.db import models
from django.db.models.functions import TruncDate, TruncTime
from rest_framework import serializers

from ami.main.models import SourceImage

logger = logging.getLogger(__name__)


class CapturesTabularSerializer(serializers.Serializer):
    capture_id = serializers.IntegerField(source="id")
    latitude = serializers.FloatField()
    longitude = serializers.FloatField()
    datetime_observed = serializers.DateTimeField()
    date_observed = serializers.DateField()
    time_observed = serializers.TimeField()
    session_id = serializers.IntegerField()
    session_start_datetime = serializers.DateTimeField()
    session_start_date = serializers.DateField()
    session_start_time = serializers.TimeField()
    session_end_datetime = serializers.DateTimeField()
    session_end_date = serializers.DateField()
    session_end_time = serializers.TimeField()
    session_duration = serializers.DurationField()
    station_name = serializers.CharField()
    station_id = serializers.IntegerField()
    device_id = serializers.IntegerField()
    device_name = serializers.CharField()
    # The "_fresh" annotation name avoids shadowing the model's existing detections_count attribute
    detections_count = serializers.IntegerField(source="detections_count_fresh")
    occurrences_count = serializers.IntegerField()
    taxa_count = serializers.IntegerField()


def get_queryset():
    return (
        SourceImage.objects.all()
        .annotate(
            datetime_observed=models.F("timestamp"),
            date_observed=TruncDate("timestamp"),
            time_observed=TruncTime("timestamp"),
            latitude=models.F("deployment__latitude"),
            longitude=models.F("deployment__longitude"),
            session_id=models.F("event_id"),
            session_start_datetime=models.F("event__start"),
            session_start_date=TruncDate("event__start"),
            session_start_time=TruncTime("event__start"),
            session_end_datetime=models.F("event__end"),
            session_end_date=TruncDate("event__end"),
            session_end_time=TruncTime("event__end"),
            session_duration=models.F("event__end") - models.F("event__start"),
            station_name=models.F("deployment__name"),
            station_id=models.F("deployment_id"),
            device_id=models.F("deployment__device_id"),
            device_name=models.F("deployment__device__name"),
            detections_count_fresh=models.Count("detections", distinct=True),
            occurrences_count=models.Count("detections__occurrence", distinct=True),
            taxa_count=models.Count("detections__occurrence__determination", distinct=True),
        )
        .order_by("datetime_observed")
    )
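
A minimal usage sketch for this module (illustrative only; assumes a populated development database, with full file output handled by write_export in ami/exports/base.py below):

from ami.exports.all_captures import CapturesTabularSerializer, get_queryset

# Hypothetical spot check: serialize the first annotated capture row.
row = CapturesTabularSerializer(get_queryset().first()).data
print(row["station_name"], row["detections_count"], row["taxa_count"])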
57 changes: 57 additions & 0 deletions ami/exports/all_sessions.py
@@ -0,0 +1,57 @@
import logging

from django.db import models
from django.db.models.functions import TruncDate, TruncTime
from rest_framework import serializers

from ami.main.models import Event

logger = logging.getLogger(__name__)


class SessionsTabularSerializer(serializers.Serializer):
    session_id = serializers.IntegerField(source="id")
    session_start_datetime = serializers.DateTimeField()
    session_start_date = serializers.DateField()
    session_start_time = serializers.TimeField()
    session_end_datetime = serializers.DateTimeField()
    session_end_date = serializers.DateField()
    session_end_time = serializers.TimeField()
    session_duration = serializers.DurationField()
    latitude = serializers.FloatField()
    longitude = serializers.FloatField()
    station_name = serializers.CharField()
    station_id = serializers.IntegerField()
    device_id = serializers.IntegerField()
    device_name = serializers.CharField()
    # The "_fresh" annotation names avoid shadowing the model's existing *_count attributes
    captures_count = serializers.IntegerField(source="captures_count_fresh")
    detections_count = serializers.IntegerField(source="detections_count_fresh")
    occurrences_count = serializers.IntegerField(source="occurrences_count_fresh")
    taxa_count = serializers.IntegerField()


def get_queryset():
    return (
        Event.objects.all()
        .annotate(
            session_id=models.F("id"),
            session_start_datetime=models.F("start"),
            session_start_date=TruncDate("start"),
            session_start_time=TruncTime("start"),
            session_end_datetime=models.F("end"),
            session_end_date=TruncDate("end"),
            session_end_time=TruncTime("end"),
            session_duration=models.F("end") - models.F("start"),
            latitude=models.F("deployment__latitude"),
            longitude=models.F("deployment__longitude"),
            station_name=models.F("deployment__name"),
            station_id=models.F("deployment_id"),
            device_id=models.F("deployment__device_id"),
            device_name=models.F("deployment__device__name"),
            captures_count_fresh=models.Count("captures", distinct=True),
            detections_count_fresh=models.Count("captures__detections", distinct=True),
            occurrences_count_fresh=models.Count("captures__detections__occurrence", distinct=True),
            taxa_count=models.Count("captures__detections__occurrence__determination", distinct=True),
        )
        .order_by("session_start_datetime", "station_id")
    )
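
The sessions module mirrors the captures export. A hedged sketch of narrowing the queryset before writing it out (the deployment primary key 42 is a made-up example value):

from ami.exports.all_sessions import get_queryset

# Hypothetical: restrict the export to a single station (deployment) by primary key.
single_station_sessions = get_queryset().filter(deployment_id=42)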
7 changes: 7 additions & 0 deletions ami/exports/apps.py
@@ -0,0 +1,7 @@
from django.apps import AppConfig
from django.utils.translation import gettext_lazy as _


class ExportsConfig(AppConfig):
    name = "ami.exports"
    verbose_name = _("Data Exports & Reports")
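
For Django to load the new app, it would also need to be registered in the project settings; a standard registration step assumed here rather than shown in the diff:

# In the project settings module (path assumed, not part of this diff):
INSTALLED_APPS = [
    # ...
    "ami.exports",
]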
81 changes: 81 additions & 0 deletions ami/exports/base.py
@@ -0,0 +1,81 @@
import csv
import logging

from django.core.files.storage import default_storage
from django.db import models
from django.utils import timezone
from django.utils.text import slugify
from rest_framework import serializers
from rest_framework.views import APIView
from tqdm import tqdm

logger = logging.getLogger(__name__)


class BaseExportSerializer(serializers.Serializer):
    """
    Base serializer for exporting data in various formats, from multiple models.
    """

    pass


class BaseExportView(APIView):
    """
    Read-only API view for exporting data in various formats, from multiple models.
    """

    pass


def get_data_in_batches(QuerySet: models.QuerySet, Serializer: type[serializers.Serializer], batch_size=1000):
    """Serialize a queryset with the given serializer, yielding rows in batches of `batch_size`."""
    items = QuerySet.iterator(chunk_size=batch_size)
    batch = []
    for i, item in enumerate(items):
        try:
            serializer = Serializer(item)
            batch.append(serializer.data)

            if len(batch) >= batch_size:
                yield batch
                batch = []
        except Exception as e:
            logger.warning(f"Error processing item {i}: {str(e)}")
            raise
    if batch:
        yield batch


def write_export(report_name, Serializer: type[serializers.Serializer], QuerySet: models.QuerySet):
    """Write the serialized queryset to a timestamped CSV file in the default storage backend."""
    timestamp = timezone.now().strftime("%Y%m%d-%H%M%S")
    file_path = f"{slugify(report_name)}-{timestamp}.csv"

    try:
        with default_storage.open(file_path, "w") as file:
            writer = csv.writer(file)
            writer.writerow(Serializer().fields.keys())  # Write header row

            # Total item count drives the progress bar
            total_items = QuerySet.count()

            with tqdm(total=total_items, desc="Exporting data", unit="items") as pbar:
                for batch in get_data_in_batches(Serializer=Serializer, QuerySet=QuerySet):
                    for item in batch:
                        writer.writerow(item.values())
                        pbar.update(1)

        logger.info(f"CSV export generated successfully: {file_path}")
        return file_path
    except Exception as e:
        logger.error(f"Error generating CSV export: {str(e)}")
        raise
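
An end-to-end sketch tying write_export to one of the export modules above (illustrative; assumes default_storage is configured, e.g. a local MEDIA_ROOT in development):

from ami.exports import all_captures
from ami.exports.base import write_export

# Writes e.g. "all-captures-20240101-120000.csv" to the default storage backend
# and returns the file path.
path = write_export(
    report_name="All captures",
    Serializer=all_captures.CapturesTabularSerializer,
    QuerySet=all_captures.get_queryset(),
)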
99 changes: 99 additions & 0 deletions ami/exports/by_capture.py
@@ -0,0 +1,99 @@
import logging
import typing

from django.contrib.postgres.aggregates import ArrayAgg
from django.db import models
from django.db.models.functions import TruncDate, TruncTime
from rest_framework import serializers

from ami.main.models import Detection, Taxon, TaxonRank

logger = logging.getLogger(__name__)


class DetectionsByDeterminationAndCaptureTabularSerializer(serializers.Serializer):
    capture_id = serializers.IntegerField(source="source_image_id")
    latitude = serializers.FloatField()
    longitude = serializers.FloatField()
    datetime_observed = serializers.DateTimeField()
    date_observed = serializers.DateField()
    time_observed = serializers.TimeField()
    session_id = serializers.IntegerField()
    session_start_datetime = serializers.DateTimeField()
    session_start_date = serializers.DateField()
    session_start_time = serializers.TimeField()
    session_end_datetime = serializers.DateTimeField()
    session_end_date = serializers.DateField()
    session_end_time = serializers.TimeField()
    session_duration = serializers.DurationField()
    taxon_id = serializers.IntegerField()
    taxon_name = serializers.CharField()
    taxon_rank = serializers.CharField()
    taxon_count = serializers.IntegerField()
    determination_score_max = serializers.FloatField()
    detection_ids = serializers.CharField()
    occurrence_ids = serializers.CharField()
    station_name = serializers.CharField()
    station_id = serializers.IntegerField()
    device_id = serializers.IntegerField()
    device_name = serializers.CharField()

    def to_representation(self, instance: typing.Any) -> dict[str, typing.Any]:
        data = super().to_representation(instance)

        # Fill in one column per ancestor rank (e.g. taxon_family, taxon_genus).
        # Guard against rows without a determination to avoid Taxon.DoesNotExist.
        if data.get("taxon_id") is not None:
            taxon: Taxon = Taxon.objects.get(id=data["taxon_id"])
            for parent in taxon.parents_json:
                field_name = f"taxon_{parent.rank.name.lower()}"
                data[field_name] = parent.name

        return data

    def get_fields(self):
        # Add an optional column for every known taxonomic rank
        fields = super().get_fields()
        for rank in TaxonRank:
            field_name = f"taxon_{rank.name.lower()}"
            fields[field_name] = serializers.CharField(required=False)
        return fields


def get_queryset():
    return (
        Detection.objects.all()
        .select_related(
            "occurrence",
            "occurrence__determination",
            "source_image",
        )
        # Group detections by capture and determination, then aggregate per group
        .values(
            "source_image_id",
            "occurrence__determination_id",
        )
        .annotate(
            capture_id=models.F("source_image_id"),
            datetime_observed=models.F("source_image__timestamp"),
            date_observed=TruncDate("source_image__timestamp"),
            time_observed=TruncTime("source_image__timestamp"),
            latitude=models.F("source_image__deployment__latitude"),
            longitude=models.F("source_image__deployment__longitude"),
            session_id=models.F("source_image__event_id"),
            session_start_datetime=models.F("source_image__event__start"),
            session_start_date=TruncDate("source_image__event__start"),
            session_start_time=TruncTime("source_image__event__start"),
            session_end_datetime=models.F("source_image__event__end"),
            session_end_date=TruncDate("source_image__event__end"),
            session_end_time=TruncTime("source_image__event__end"),
            session_duration=models.F("source_image__event__end") - models.F("source_image__event__start"),
            station_name=models.F("source_image__deployment__name"),
            station_id=models.F("source_image__deployment_id"),
            taxon_id=models.F("occurrence__determination_id"),
            taxon_name=models.F("occurrence__determination__name"),
            taxon_rank=models.F("occurrence__determination__rank"),
            determination_score_max=models.Max("occurrence__determination_score"),
            taxon_count=models.Count("id"),
            detection_ids=ArrayAgg("id"),
            occurrence_ids=ArrayAgg("occurrence_id"),
            device_id=models.F("source_image__deployment__device_id"),
            device_name=models.F("source_image__deployment__device__name"),
        )
        .order_by("source_image_id", "-taxon_count", "-determination_score_max")
    )
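
A quick way to inspect the per-rank columns that get_fields() adds dynamically (hypothetical snippet; the exact columns depend on the project's TaxonRank enum):

from ami.exports.by_capture import DetectionsByDeterminationAndCaptureTabularSerializer

serializer = DetectionsByDeterminationAndCaptureTabularSerializer()
# Static taxon fields plus one "taxon_<rank>" column per rank in TaxonRank.
print([name for name in serializer.fields if name.startswith("taxon_")])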