From d9d77781212e25440bf0e6660248c362514230f0 Mon Sep 17 00:00:00 2001 From: nattjn Date: Wed, 19 Jul 2023 21:22:37 +0700 Subject: [PATCH 1/6] Add functions to crawl news in MFA --- .../web_crawls_mfa/crawl_news.py | 66 +++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 src/data/openthaigpt_pretraining_data/web_crawls_mfa/crawl_news.py diff --git a/src/data/openthaigpt_pretraining_data/web_crawls_mfa/crawl_news.py b/src/data/openthaigpt_pretraining_data/web_crawls_mfa/crawl_news.py new file mode 100644 index 00000000..83010022 --- /dev/null +++ b/src/data/openthaigpt_pretraining_data/web_crawls_mfa/crawl_news.py @@ -0,0 +1,66 @@ +import requests +import time +from openthaigpt_pretraining_data.web_crawls_mfa.crawl_gov_achievements import ( +    process_response, +    process_info, +) + +ROOT = "https://www.mfa.go.th" +DIV_TAG = "div" +P_TAG = "p" +A_TAG = "a" +DATE_CLASS = "date" +INFO_CLASS = "p-3 col-md-4" +DETAIL_CLASS = "ContentDetailstyled__ContentDescription-sc-150bmwg-4 jWrYsI mb-3" + + +def get_title_date(cur_url, page_no, time_delay): +    """ +    Description: +        Get data processed by the function process_response. +    Args: +        cur_url: The desired URL to be used as a root. +        page_no: The total number of pages. +        time_delay: Delay before another request (in second). +    Returns: +        news_list: A list containing titles and dates. +    """ +    news_list = [] + +    for page in range(1, page_no + 1): +        url = f"{cur_url}&p={page}" +        res = requests.get(url) +        res.encoding = "utf-8" + +        if res.status_code == 200: +            processed_data = process_response(res.text, time_delay) +            news_list.extend(processed_data) + +        time.sleep(0.5) + +    return news_list + + +def get_info(cur_url, page_no, time_delay): +    """ +    Description: +        Get data inside a link for every page. +    Args: +        cur_url, page_no, time_delay: same as in get_title_date.
+ Returns: + info_list contains details of the news + """ + info_list = [] + + for page in range(1, page_no + 1): + url = f"{cur_url}&p={page}" + res = requests.get(url) + res.encoding = "utf-8" + + if res.status_code == 200: + processed_info = process_info(res.text, time_delay) + info_list.extend(processed_info) + + time.sleep(0.5) + + return info_list From 0e6bf206fb31d98405218032036d6ffccde2ebfb Mon Sep 17 00:00:00 2001 From: nattjn Date: Wed, 19 Jul 2023 21:24:11 +0700 Subject: [PATCH 2/6] Add script to crawl embassy consulate contents --- .../scripts/crawl_mfa/embassy_cosulate.py | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 src/data/scripts/crawl_mfa/embassy_cosulate.py diff --git a/src/data/scripts/crawl_mfa/embassy_cosulate.py b/src/data/scripts/crawl_mfa/embassy_cosulate.py new file mode 100644 index 00000000..a6a3e725 --- /dev/null +++ b/src/data/scripts/crawl_mfa/embassy_cosulate.py @@ -0,0 +1,21 @@ +import pandas as pd +from datasets import Dataset, load_from_disk +from openthaigpt_pretraining_data.web_crawls_mfa.crawl_news import ( + get_title_date, + get_info, +) + +EMBASSY_CONSULATE_URL = "https://www.mfa.go.th/th/page/%E0%B8%82%E0%B9%88%E0%B8%B2%E0%B8%A7%E0%B8%81%E0%B8%B4%E0%B8%88%E0%B8%81%E0%B8%A3%E0%B8%A3%E0%B8%A1%E0%B8%AA%E0%B8%96%E0%B8%B2%E0%B8%99%E0%B9%80%E0%B8%AD%E0%B8%81%E0%B8%AD%E0%B8%B1%E0%B8%84%E0%B8%A3%E0%B8%A3%E0%B8%B2%E0%B8%8A%E0%B8%97%E0%B8%B9%E0%B8%95%E0%B9%81%E0%B8%A5%E0%B8%B0%E0%B8%AA%E0%B8%96%E0%B8%B2%E0%B8%99%E0%B8%81%E0%B8%87%E0%B8%AA%E0%B8%B8%E0%B8%A5%E0%B9%83%E0%B8%AB%E0%B8%8D%E0%B9%88?menu=5f2110a3c1d7dc1b17651cb2" + +news_title_date = get_title_date(cur_url=EMBASSY_CONSULATE_URL, page_no=501) +news_details = get_info(cur_url=EMBASSY_CONSULATE_URL, page_no=501) + +for i, data_dict in enumerate(news_title_date): + if i < len(news_details): + data_dict.update({"detail": news_details[i]}) + +all_news = pd.DataFrame(news_title_date) +dataset = Dataset.from_pandas(all_news) 
+dataset.save_to_disk("mfa_embassy_consulate.arrow") +loaded_dataset = load_from_disk("mfa_embassy_consulate.arrow") + From 5ad7a5a66980431189564c3dc3c5a2c23b9f3af9 Mon Sep 17 00:00:00 2001 From: nattjn Date: Wed, 19 Jul 2023 21:24:42 +0700 Subject: [PATCH 3/6] Add script to crawl other news --- src/data/scripts/crawl_mfa/other_news.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 src/data/scripts/crawl_mfa/other_news.py diff --git a/src/data/scripts/crawl_mfa/other_news.py b/src/data/scripts/crawl_mfa/other_news.py new file mode 100644 index 00000000..3f822d32 --- /dev/null +++ b/src/data/scripts/crawl_mfa/other_news.py @@ -0,0 +1,20 @@ +import pandas as pd +from datasets import Dataset, load_from_disk +from openthaigpt_pretraining_data.web_crawls_mfa.crawl_news import ( + get_title_date, + get_info, +) + +OTHER_NEWS_URL = "https://www.mfa.go.th/th/page/%E0%B8%82%E0%B9%88%E0%B8%B2%E0%B8%A7%E0%B8%AD%E0%B8%B7%E0%B9%88%E0%B8%99%E0%B9%86?menu=5d5bd3d815e39c306002aac7" + +news_title_date = get_title_date(cur_url=OTHER_NEWS_URL, page_no=10) +news_details = get_info(cur_url=OTHER_NEWS_URL, page_no=10) + +for i, data_dict in enumerate(news_title_date): + if i < len(news_details): + data_dict.update({"detail": news_details[i]}) + +all_news = pd.DataFrame(news_title_date) +dataset = Dataset.from_pandas(all_news) +dataset.save_to_disk("mfa_other_news.arrow") +loaded_dataset = load_from_disk("mfa_other_news.arrow") \ No newline at end of file From c31e35aec1590a3396a58a11f95d02eae6c148f9 Mon Sep 17 00:00:00 2001 From: nattjn Date: Wed, 19 Jul 2023 21:25:24 +0700 Subject: [PATCH 4/6] Add script to crawl press release --- src/data/scripts/crawl_mfa/press_release.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 src/data/scripts/crawl_mfa/press_release.py diff --git a/src/data/scripts/crawl_mfa/press_release.py b/src/data/scripts/crawl_mfa/press_release.py new file mode 100644 index 00000000..5fa7849c --- /dev/null 
+++ b/src/data/scripts/crawl_mfa/press_release.py @@ -0,0 +1,20 @@ +import pandas as pd +from datasets import Dataset, load_from_disk +from openthaigpt_pretraining_data.web_crawls_mfa.crawl_news import ( + get_title_date, + get_info, +) + +PRESS_RELEASE_URL = "https://www.mfa.go.th/th/page/%E0%B8%82%E0%B9%88%E0%B8%B2%E0%B8%A7%E0%B8%AA%E0%B8%B2%E0%B8%A3%E0%B8%99%E0%B8%B4%E0%B9%80%E0%B8%97%E0%B8%A8?menu=5d5bd3d815e39c306002aac5" + +news_title_date = get_title_date(cur_url=PRESS_RELEASE_URL, page_no=313) +news_details = get_info(cur_url=PRESS_RELEASE_URL, page_no=313) + +for i, data_dict in enumerate(news_title_date): + if i < len(news_details): + data_dict.update({"detail": news_details[i]}) + +all_news = pd.DataFrame(news_title_date) +dataset = Dataset.from_pandas(all_news) +dataset.save_to_disk("mfa_press_release.arrow") +loaded_dataset = load_from_disk("mfa_press_release.arrow") From 704400ce4ad30fa7710d270ee74780e3d234ef06 Mon Sep 17 00:00:00 2001 From: nattjn Date: Wed, 19 Jul 2023 21:25:52 +0700 Subject: [PATCH 5/6] Add script to crawl speeches --- src/data/scripts/crawl_mfa/speeches.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 src/data/scripts/crawl_mfa/speeches.py diff --git a/src/data/scripts/crawl_mfa/speeches.py b/src/data/scripts/crawl_mfa/speeches.py new file mode 100644 index 00000000..a94ea244 --- /dev/null +++ b/src/data/scripts/crawl_mfa/speeches.py @@ -0,0 +1,21 @@ +import pandas as pd +from datasets import Dataset, load_from_disk +from openthaigpt_pretraining_data.web_crawls_mfa.crawl_news import ( + get_title_date, + get_info, +) + +SPEECHES_URL = "https://www.mfa.go.th/th/page/%E0%B8%AA%E0%B8%B8%E0%B8%99%E0%B8%97%E0%B8%A3%E0%B8%9E%E0%B8%88%E0%B8%99%E0%B9%8C?menu=5d5bd3d815e39c306002aacd" + +news_title_date = get_title_date(cur_url=SPEECHES_URL, page_no=8) +news_details = get_info(cur_url=SPEECHES_URL, page_no=8) + +for i, data_dict in enumerate(news_title_date): + if i < len(news_details): + 
data_dict.update({"detail": news_details[i]}) + +all_news = pd.DataFrame(news_title_date) +dataset = Dataset.from_pandas(all_news) +dataset.save_to_disk("MFA_speeches.arrow") +loaded_dataset = load_from_disk("MFA_speeches.arrow") + From 78211aa9d14153cba5d45e074d9de936d0e52036 Mon Sep 17 00:00:00 2001 From: nattjn Date: Wed, 19 Jul 2023 21:26:19 +0700 Subject: [PATCH 6/6] Add script to crawl top stories --- src/data/scripts/crawl_mfa/top_news.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 src/data/scripts/crawl_mfa/top_news.py diff --git a/src/data/scripts/crawl_mfa/top_news.py b/src/data/scripts/crawl_mfa/top_news.py new file mode 100644 index 00000000..8be0cef8 --- /dev/null +++ b/src/data/scripts/crawl_mfa/top_news.py @@ -0,0 +1,20 @@ +import pandas as pd +from datasets import Dataset, load_from_disk +from openthaigpt_pretraining_data.web_crawls_mfa.crawl_news import ( + get_title_date, + get_info, +) + +TOP_STORIES_URL = "https://www.mfa.go.th/th/page/%E0%B8%82%E0%B9%88%E0%B8%B2%E0%B8%A7%E0%B9%80%E0%B8%94%E0%B9%88%E0%B8%99?menu=5d5bd3d815e39c306002aac4" + +news_title_date = get_title_date(cur_url=TOP_STORIES_URL, page_no=216) +news_details = get_info(cur_url=TOP_STORIES_URL, page_no=216) + +for i, data_dict in enumerate(news_title_date): + if i < len(news_details): + data_dict.update({"detail": news_details[i]}) + +all_news = pd.DataFrame(news_title_date) +dataset = Dataset.from_pandas(all_news) +dataset.save_to_disk("MFA_top_news.arrow") +loaded_dataset = load_from_disk("MFA_top_news.arrow") \ No newline at end of file