From a6e98ddf8742de15a2106ef91d92980b8fb29355 Mon Sep 17 00:00:00 2001 From: jzl18thu Date: Sat, 14 Sep 2024 15:21:16 +0800 Subject: [PATCH] fix(tpc): update download method of tpch data (#448) * fix(tpc): update download method of tpch data * recover other tests --------- Co-authored-by: Yuqing Zhu --- .github/scripts/tpch/download_tpch_data.sh | 49 ----- .github/workflows/tpc-h.yml | 6 +- .../test/resources/tpch/thu_cloud_download.py | 187 ------------------ 3 files changed, 4 insertions(+), 238 deletions(-) delete mode 100644 .github/scripts/tpch/download_tpch_data.sh delete mode 100644 test/src/test/resources/tpch/thu_cloud_download.py diff --git a/.github/scripts/tpch/download_tpch_data.sh b/.github/scripts/tpch/download_tpch_data.sh deleted file mode 100644 index e6707b1c84..0000000000 --- a/.github/scripts/tpch/download_tpch_data.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/bash -# -# IGinX - the polystore system with high performance -# Copyright (C) Tsinghua University -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . -# - - -if [ "$RUNNER_OS" = "Windows" ]; then - python test/src/test/resources/tpch/thu_cloud_download.py \ - -l https://cloud.tsinghua.edu.cn/d/740c158819bc4759a36e/ \ - -s "." -else - python3 test/src/test/resources/tpch/thu_cloud_download.py \ - -l https://cloud.tsinghua.edu.cn/d/740c158819bc4759a36e/ \ - -s "." -fi - cd tpchdata - # 目标文件夹路径 - destination_folder="../tpc/TPC-H V3.0.1/data" - - # 确保目标文件夹存在,如果不存在则创建 - mkdir -p "$destination_folder" - - # 将所有*.tbl文件移动到目标文件夹 - mv *.tbl "$destination_folder/" - cd "$destination_folder" - - chmod +r customer.tbl - chmod +r lineitem.tbl - chmod +r nation.tbl - chmod +r orders.tbl - chmod +r region.tbl - chmod +r supplier.tbl - ls -a - pwd - echo "文件移动完成" diff --git a/.github/workflows/tpc-h.yml b/.github/workflows/tpc-h.yml index 9be9a97f42..c2ceabd0de 100644 --- a/.github/workflows/tpc-h.yml +++ b/.github/workflows/tpc-h.yml @@ -78,8 +78,10 @@ jobs: - name: Download TPC-H Data shell: bash run: | - chmod +x "${GITHUB_WORKSPACE}/.github/scripts/tpch/download_tpch_data.sh" - "${GITHUB_WORKSPACE}/.github/scripts/tpch/download_tpch_data.sh" + wget https://github.com/IGinX-THU/IGinX-resources/raw/main/resources/tpc.7z + sudo apt-get install p7zip-full + 7za x tpc.7z + ls tpc - name: Run ZooKeeper uses: ./.github/actions/zookeeperRunner diff --git a/test/src/test/resources/tpch/thu_cloud_download.py b/test/src/test/resources/tpch/thu_cloud_download.py deleted file mode 100644 index 474e9b9dd4..0000000000 --- a/test/src/test/resources/tpch/thu_cloud_download.py +++ /dev/null @@ -1,187 +0,0 @@ -# -# IGinX - the polystore system with high performance -# Copyright (C) Tsinghua University -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . -# - -import os -import re -import logging -import fnmatch -import requests -import argparse -import urllib.parse -from tqdm import tqdm - - -sess = requests.Session() -logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') - - -def parse_args(): - args = argparse.ArgumentParser() - args.add_argument('-l', '--link', type=str, required=True, help='Share link of Tsinghua Cloud') - args.add_argument('-s', '--save_dir', type=str, default=None, help='Path to save the files. Default: Desktop') - args.add_argument('-f', '--file', type=str, default=None, help='Regex to match the file path') - return args.parse_args() - - -def get_share_key(url: str) -> str: - prefix = 'https://cloud.tsinghua.edu.cn/d/' - if not url.startswith(prefix): - raise ValueError('Share link of Tsinghua Cloud should start with {}'.format(prefix)) - share_key = url[len(prefix):].replace('/', '') - logging.info('Share key: {}'.format(share_key)) - return share_key - - -def get_root_dir(share_key: str) -> str: - # Aquire the root directory name of the share link, - # run after verify_password function - global sess - pattern = '' - r = sess.get(f"https://cloud.tsinghua.edu.cn/d/{share_key}/") - root_dir = re.findall(pattern, r.text) - assert root_dir is not None, "Couldn't find title of the share link." - logging.info("Root directory name: {}".format(root_dir[0])) - return root_dir[0] - - -def verify_password(share_key: str) -> None: - # Require password if the share link is password-protected, - # and verify the password provided by the user. - global sess - r = sess.get(f"https://cloud.tsinghua.edu.cn/d/{share_key}/") - pattern = '' - csrfmiddlewaretoken = re.findall(pattern, r.text) - if csrfmiddlewaretoken: - pwd = input("Please enter the password: ") - - csrfmiddlewaretoken = csrfmiddlewaretoken[0] - data = { - "csrfmiddlewaretoken": csrfmiddlewaretoken, - "token": share_key, - "password": pwd - } - r = sess.post(f"https://cloud.tsinghua.edu.cn/d/{share_key}/", data=data, - headers={"Referer": f"https://cloud.tsinghua.edu.cn/d/{share_key}/"}) - if "Please enter a correct password" in r.text: - raise ValueError("Wrong password.") - - -def is_match(file_path: str, pattern: str) -> bool: - # judge if the file path matches the regex provided by the user - file_path = file_path[1:] # remove the first '/' - return pattern is None or fnmatch.fnmatch(file_path, pattern) - - -def dfs_search_files(share_key: str, - path: str = "/", - pattern: str = None) -> list: - global sess - filelist = [] - encoded_path = urllib.parse.quote(path) - r = sess.get(f'https://cloud.tsinghua.edu.cn/api/v2.1/share-links/{share_key}/dirents/?path={encoded_path}') - objects = r.json()['dirent_list'] - for obj in objects: - if obj["is_dir"]: - filelist.extend( - dfs_search_files(share_key, obj['folder_path'], pattern)) - elif is_match(obj["file_path"], pattern): - filelist.append(obj) - return filelist - - -def download_single_file(url: str, fname: str, pbar: tqdm): - global sess - resp = sess.get(url, stream=True) - with open(fname, 'wb') as file: - for data in resp.iter_content(chunk_size=1024): - size = file.write(data) - pbar.update(size) - - -def print_filelist(filelist): - print("=" * 100) - print("Last Modified Time".ljust(25), " ", "File Size".rjust(10), " ", "File Path") - print("-" * 100) - for i, file in enumerate(filelist, 1): - print(file["last_modified"], " ", str(file["size"]).rjust(10), " ", file["file_path"]) - if i == 100: - print("... %d more files" % (len(filelist) - 100)) - break - print("-" * 100) - - -def download(share_key: str, filelist: list, save_dir: str) -> None: - if os.path.exists(save_dir): - logging.warning("Save directory already exists. Files will be overwritten.") - total_size = sum([file["size"] for file in filelist]) - pbar = tqdm(total=total_size, ncols=120, unit='iB', unit_scale=True, unit_divisor=1024) - for i, file in enumerate(filelist): - file_url = 'https://cloud.tsinghua.edu.cn/d/{}/files/?p={}&dl=1'.format(share_key, file["file_path"]) - save_path = os.path.join(save_dir, file["file_path"][1:]) - os.makedirs(os.path.dirname(save_path), exist_ok=True) - # logging.info("[{}/{}] Downloading File: {}".format(i + 1, len(filelist), save_path)) - try: - pbar.set_description("[{}/{}]".format(i + 1, len(filelist))) - download_single_file(file_url, save_path, pbar) - - except Exception as e: - logging.error("Error happened when downloading file: {}".format(save_path)) - logging.error(e) - pbar.close() - logging.info("Download finished.") - - - -def main(): - args = parse_args() - url, pattern, save_dir = args.link, args.file, args.save_dir - share_key = get_share_key(url) - verify_password(share_key) - - # search files - logging.info("Searching for files to be downloaded, Wait a moment...") - filelist = dfs_search_files(share_key, pattern=pattern) - filelist.sort(key=lambda x: x["file_path"]) - if not filelist: - logging.info("No file found.") - return - - print_filelist(filelist) - total_size = sum([file["size"] for file in filelist]) / 1024 / 1024 # MB - logging.info(f"# Files: {len(filelist)}. Total size: {total_size: .1f} MB.") - - # Save to desktop by default. - if save_dir is None: - save_dir = os.path.join(os.path.expanduser("~"), 'Desktop') - assert os.path.exists(save_dir), "Desktop folder not found." - root_dir = get_root_dir(share_key) - save_dir = os.path.join(save_dir, root_dir) - - download(share_key, filelist, save_dir) - - - -if __name__ == '__main__': - """ - 用法: - python main.py \ - -l https://cloud.tsinghua.edu.cn/d/1234567890/ \ - -s "~/path_to_save" \ - -f "*.pptx?" (regex, 正则表达式) \ - """ - main() \ No newline at end of file