Commit 19c004e: update statistics

SaulLu committed Feb 2, 2022
1 parent: c388e47

Showing 2 changed files with 19 additions and 13 deletions.
23 changes: 14 additions & 9 deletions dashboard/python_scripts/compute_stats.py
@@ -1,23 +1,18 @@
+import os
 import json
 import logging
 import subprocess
-import sys
 from argparse import ArgumentParser
 from pathlib import Path
 from statistics import mean

 import datasets
-from bs4 import BeautifulSoup
-from bs4.dammit import EncodingDetector
 from datasets import config, load_from_disk
 from datasets.utils.logging import set_verbosity_info

 set_verbosity_info()
 logger = logging.getLogger(__name__)

-# For `soup.decode_content` that can hit the limit
-sys.setrecursionlimit(10000)
-

 def get_args():
     parser = ArgumentParser()
@@ -59,6 +54,11 @@ def main():
     args = get_args()
     logger.info(f"** The job is run with the following arguments: **\n{args}\n **** ")

+    if os.path.isfile(args.save_path_stats_json):
+        logger.info(f" --- Statistics already computed for seed id {args.seed_id} ")
+        return
+
+    logger.info(f" --- Statistics not already computed for seed id {args.seed_id} ")
     if not args.use_datasets_caching:
         datasets.set_caching_enabled(False)
     else:
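
The guard added here makes each array task idempotent: a re-submitted job returns before loading any data when the stats file already exists. A minimal sketch of the pattern, assuming a hypothetical stats_path argument rather than the script's real CLI:

import json
import os


def compute_stats(stats_path: str) -> None:
    # Early exit keeps re-runs cheap: seeds with saved stats are skipped.
    if os.path.isfile(stats_path):
        print(f"Statistics already computed at {stats_path}, skipping")
        return
    stats = {"seed_id": 0}  # placeholder for the real computation
    with open(stats_path, "w") as f:
        json.dump(stats, f)


compute_stats("/tmp/stats.json")  # a second call becomes a no-op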
@@ -92,8 +92,10 @@ def main():

     ds_html = splits[selected_mime_types[0]]

+    logger.info(f"The current splits are {data_stats}.")
+
     def get_length_text(example):
-        example["length_text"] = len(example["text"])
+        example["length_text"] = len(example["text"]) if example["text"] is not None else 0
         return example

     cols_to_remove = [col for col in ds.column_names if col not in ["content_languages", "url_host_tld"]]
@@ -105,7 +107,9 @@ def get_length_text(example):
     )

     data_stats["html_empty_text"] = len([e for e in ds_html["length_text"] if e == 0])
-    data_stats["html_mean_length_non_empty_text"] = mean([e for e in ds_html["length_text"] if e != 0])
+
+    non_empty_texts = [e for e in ds_html["length_text"] if e != 0]
+    data_stats["html_mean_length_non_empty_text"] = mean(non_empty_texts) if non_empty_texts != [] else None
     data_stats["seed_id"] = args.seed_id

     logger.info(f"There are {data_stats['html_empty_text']} empty text rows out of {len(ds_html)} rows.")
@@ -119,7 +123,8 @@ def get_length_text(example):
     subprocess.run(["mv", save_path_tmp, str(save_path.absolute())])

     save_path = Path(args.save_path_stats_full_json)
-    save_path_tmp = f"{str(save_path.absolute())}.tmp"
+    tmp_file_name = f"tmp-{str(save_path.name)}"
+    save_path_tmp = os.path.join(save_path.parent, tmp_file_name)
     logger.info(f"Saving the dataset at {save_path_tmp}")
     ds_html.to_json(
         save_path_tmp,
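
Two of the fixes above guard real failure modes. statistics.mean raises StatisticsError on an empty list, so a seed whose texts are all empty no longer crashes the job. And since save_path_stats_full_json pointed at a directory created with mkdir -p in the old .slurm script (see below), the previous mv of a sibling full.tmp would have dropped the temp file inside that directory rather than renaming it; writing tmp-<name> next to a proper full.jsonl.gz target fixes the rename. A self-contained sketch of both patterns, with illustrative names and paths rather than the repository's:

import subprocess
import tempfile
from pathlib import Path
from statistics import mean

lengths = [42, 0, 17]  # stand-in for ds_html["length_text"]
non_empty = [n for n in lengths if n != 0]
# Guarded mean: None instead of a StatisticsError when every text is empty.
mean_length = mean(non_empty) if non_empty else None

out_dir = Path(tempfile.mkdtemp())
save_path = out_dir / "full.jsonl.gz"
# Write to tmp-<name> in the target directory, then rename into place.
save_path_tmp = out_dir / f"tmp-{save_path.name}"
save_path_tmp.write_text('{"length_text": 42}\n')
subprocess.run(["mv", str(save_path_tmp), str(save_path)], check=True)
print(mean_length, save_path.exists())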
9 changes: 5 additions & 4 deletions dashboard/slurm_scripts/compute_stats_on_pseudo_crawl.slurm
@@ -1,12 +1,12 @@
 #!/bin/bash
-#SBATCH --job-name=pseudo_crawl_compute_stats
+#SBATCH --job-name=pseudo_crawl_compute_stats_v5
 #SBATCH --nodes=1
 #SBATCH --ntasks-per-node=1          # crucial - only 1 task per dist per node!
 #SBATCH --cpus-per-task=4            # number of cores per task
 #SBATCH --hint=nomultithread         # we get physical cores not logical
 #SBATCH --partition=cpu_p1
 #SBATCH --time 10:00:00              # maximum execution time (HH:MM:SS)
-#SBATCH --output=/gpfswork/rech/six/uty16tp/code/big_science/logs/compute_stats/%x-%j.out  # output file name
+#SBATCH --output=/gpfswork/rech/six/uty16tp/code/big_science/logs/compute_stats_v5/%x-%j.out  # output file name
 #SBATCH --array=1-604
 #SBATCH --account=six@cpu

@@ -26,9 +26,10 @@ echo "Computing stats on seed id ${SEED_ID}"

 DATASET_PATH=$six_ALL_CCFRSCRATCH/pseudo_crawl/datasets-seeds/bigscience-catalogue-data/pseudo_crawl_seed--seed-id--"$SEED_ID"
 SAVE_STATS_PATH=$six_ALL_CCFRSCRATCH/pseudo_crawl/datasets-stats/bigscience-catalogue-data/seed_id="$SEED_ID"/stats.json
-SAVE_STATS_PATH_FULL=$six_ALL_CCFRSCRATCH/pseudo_crawl/datasets-stats/bigscience-catalogue-data/seed_id="$SEED_ID"/full
+SAVE_STATS_PATH_DIR=$six_ALL_CCFRSCRATCH/pseudo_crawl/datasets-stats/bigscience-catalogue-data/seed_id="$SEED_ID"/full
+SAVE_STATS_PATH_FULL=$SAVE_STATS_PATH_DIR/full.jsonl.gz

-mkdir -p $SAVE_STATS_PATH_FULL
+mkdir -p $SAVE_STATS_PATH_DIR

 export HF_DATASETS_OFFLINE=1
 export HF_DATASETS_CACHE=$SCRATCH/to_delete
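
One thing the diff elides is get_args itself. A plausible reconstruction, inferred only from the attributes the script reads (seed_id, save_path_stats_json, save_path_stats_full_json, use_datasets_caching); the real parser may use different flag names or define more options:

from argparse import ArgumentParser


def get_args():
    # Hypothetical reconstruction; every flag name here is an assumption.
    parser = ArgumentParser()
    parser.add_argument("--seed-id", type=int, required=True)
    parser.add_argument("--save-path-stats-json", type=str, required=True)
    parser.add_argument("--save-path-stats-full-json", type=str, required=True)
    parser.add_argument("--use-datasets-caching", action="store_true")
    return parser.parse_args()


Each of the 604 array tasks presumably derives SEED_ID from SLURM_ARRAY_TASK_ID and fills these paths in, which is what makes the early-exit guard in compute_stats.py safe for re-submission.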
