From 2d782ec71753080c66a988f0c9d2fbf5728ab987 Mon Sep 17 00:00:00 2001
From: ruggsea <55443134+ruggsea@users.noreply.github.com>
Date: Wed, 10 May 2023 16:27:46 +0200
Subject: [PATCH 1/5] Added cli version

---
 googleimagescraper_cli.py | 97 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 97 insertions(+)
 create mode 100644 googleimagescraper_cli.py

diff --git a/googleimagescraper_cli.py b/googleimagescraper_cli.py
new file mode 100644
index 00000000..fa3d1ac9
--- /dev/null
+++ b/googleimagescraper_cli.py
@@ -0,0 +1,97 @@
+import argparse
+import concurrent.futures
+import os
+import sys
+
+from patch import webdriver_executable
+from GoogleImageScraper import GoogleImageScraper
+
+
+def worker_thread(search_key):
+    print("[INFO] Starting worker thread for search key: {}".format(search_key))
+    image_scraper = GoogleImageScraper(
+        webdriver_path,
+        image_path,
+        searchname,
+        search_key,
+        number_of_images,
+        headless,
+        min_resolution,
+        max_resolution,
+        max_missed)
+    image_urls = image_scraper.find_image_urls()
+    image_scraper.save_images(image_urls, keep_filenames)
+
+    #Release resources
+    del image_scraper
+
+
+def list_arg(string):
+    """
+    Convert a comma-separated string argument to a list of strings.
+    """
+    try:
+        # Split the string into a list on commas
+        return string.split(',')
+    except AttributeError:
+        # Raise an error if the argument is not a string
+        raise argparse.ArgumentTypeError("Invalid list argument: {}".format(string))
+
+
+def tuple_arg(string):
+    """
+    Convert a comma-separated string argument to a tuple of two integers.
+    """
+    try:
+        # Split the string into two integers on a comma
+        x, y = map(int, string.split(','))
+        return x, y
+    except ValueError:
+        # Raise an error if the string is not of the form "x,y"
+        raise argparse.ArgumentTypeError("Invalid tuple argument: {}".format(string))
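+
+# For reference, a quick sketch of how the converters behave (hypothetical
+# values): tuple_arg("640,480") returns (640, 480), tuple_arg("640x480")
+# raises ArgumentTypeError, and list_arg("cat,t-shirt") returns
+# ['cat', 't-shirt'].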
+
+
+if __name__ == "__main__":
+    #Define file paths
+    webdriver_path = os.path.normpath(os.path.join(os.getcwd(), 'webdriver', webdriver_executable()))
+    image_path = os.path.normpath(os.path.join(os.getcwd(), 'images'))
+
+    #Parameters from argparse
+    parser = argparse.ArgumentParser(description='Google Image Scraper CLI Tool', usage='%(prog)s [options]')
+    parser.add_argument('--number_of_images', type=int, default=5, help='The number of images to download (default: 5)')
+    parser.add_argument('--headless', action='store_true', help='Run in headless mode, without a Chrome GUI (default: False)')
+    parser.add_argument('--min_resolution', type=tuple_arg, default=(0, 0), help='The minimum desired image resolution (default: 0,0)')
+    parser.add_argument('--max_resolution', type=tuple_arg, default=(10000, 10000), help='The maximum desired image resolution (default: 10000,10000)')
+    parser.add_argument('--max_missed', type=int, default=10, help='The maximum number of failed images before exiting (default: 10)')
+    parser.add_argument('--number_of_workers', type=int, default=1, help='The number of worker threads to use (default: 1)')
+    parser.add_argument('--keep_filenames', action='store_true', help='Keep original URL image filenames (default: False)')
+    parser.add_argument('--searchname', type=str, default='search', help='The name of the subfolder in which images will be saved (default: "search")')
+    parser.add_argument('--search_keys', type=list_arg, help='Comma-separated list of search keys, e.g. "cat,t-shirt" (required)')
+
+    args = parser.parse_args()
+    number_of_images = args.number_of_images    # Desired number of images
+    headless = args.headless                    # True = no Chrome GUI
+    min_resolution = args.min_resolution        # Minimum desired image resolution
+    max_resolution = args.max_resolution        # Maximum desired image resolution
+    max_missed = args.max_missed                # Max number of failed images before exit
+    number_of_workers = args.number_of_workers  # Number of worker threads used
+    keep_filenames = args.keep_filenames        # Keep original URL image filenames
+    searchname = args.searchname                # Name of subfolder in which images will be saved
+
+    search_keys = args.search_keys
+    if not search_keys:
+        print("Error: You must provide at least one search key with --search_keys")
+        sys.exit(1)
+    #Remove duplicate strings from search_keys
+    search_keys = list(set(search_keys))
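+
+    # Example invocation (hypothetical values):
+    #   python googleimagescraper_cli.py --search_keys "cat,dog" \
+    #       --number_of_images 10 --headless --searchname pets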
+
+    #Run each search_key in a separate thread
+    #The context manager automatically waits for all threads to finish
+    with concurrent.futures.ThreadPoolExecutor(max_workers=number_of_workers) as executor:
+        print("Starting threads...")
+        executor.map(worker_thread, search_keys)

From cff69b3a96363b4e3544cb45163883cab153d002 Mon Sep 17 00:00:00 2001
From: ruggsea <55443134+ruggsea@users.noreply.github.com>
Date: Wed, 10 May 2023 16:32:20 +0200
Subject: [PATCH 2/5] Update README.md to include cli

---
 README.md | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index f906cde7..6c8693e3 100644
--- a/README.md
+++ b/README.md
@@ -34,8 +34,23 @@ Visit their repo here: https://github.com/JJLimmm/Website-Image-Scraper
 ```
 
 ## Usage:
-This project was created to bypass Google Chrome's new restrictions on web scraping from Google Images.
-To use it, define your desired parameters in main.py and run through the command line:
+This project was created to bypass Google Chrome's new restrictions on web scraping from Google Images.
+In the project repository, run:
+
+```
+python googleimagescraper_cli.py --search_keys cat
+```
+Optional arguments
+--number_of_images: The number of images to download (default: 5)
+--headless: Whether to run in headless mode (default: False)
+--min_resolution: The minimum desired image resolution (default: (0,0))
+--max_resolution: The maximum desired image resolution (default: (10000,10000))
+--max_missed: The maximum number of failed images before exiting (default: 10)
+--number_of_workers: The number of workers to use (default: 1)
+--keep_filenames: Whether to keep original URL image filenames (default: False)
+--searchname: The name of the subfolder in which images will be saved (default: "search")
+
+Otherwise, define your desired parameters in main.py and run through the command line:
 ```
 python main.py
 ```

From 6533fbf77ef33f9c57415910b458dc2ee47d564a Mon Sep 17 00:00:00 2001
From: ruggsea <55443134+ruggsea@users.noreply.github.com>
Date: Wed, 10 May 2023 16:39:32 +0200
Subject: [PATCH 3/5] Update README.md

---
 README.md | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 6c8693e3..fd85f620 100644
--- a/README.md
+++ b/README.md
@@ -40,16 +40,17 @@ In the project repository, run:
 ```
 python googleimagescraper_cli.py --search_keys cat
 ```
-Optional arguments
---number_of_images: The number of images to download (default: 5)
---headless: Whether to run in headless mode (default: False)
---min_resolution: The minimum desired image resolution (default: (0,0))
---max_resolution: The maximum desired image resolution (default: (10000,10000))
---max_missed: The maximum number of failed images before exiting (default: 10)
---number_of_workers: The number of workers to use (default: 1)
---keep_filenames: Whether to keep original URL image filenames (default: False)
---searchname: The name of the subfolder in which images will be saved (default: "search")
-
+Optional arguments:
+ ```
+ --number_of_images: The number of images to download (default: 5)
+ --headless: Whether to run in headless mode (default: False)
+ --min_resolution: The minimum desired image resolution (default: (0,0))
+ --max_resolution: The maximum desired image resolution (default: (10000,10000))
+ --max_missed: The maximum number of failed images before exiting (default: 10)
+ --number_of_workers: The number of workers to use (default: 1)
+ --keep_filenames: Whether to keep original URL image filenames (default: False)
+ --searchname: The name of the subfolder in which images will be saved (default: "search")
+ ```
 Otherwise, define your desired parameters in main.py and run through the command line:
 ```
 python main.py

From 6017a7cd2d4260e34392ff726b4de2535ab7c67b Mon Sep 17 00:00:00 2001
From: ruggsea <55443134+ruggsea@users.noreply.github.com>
Date: Wed, 10 May 2023 16:40:12 +0200
Subject: [PATCH 4/5] Update README.md

---
 README.md | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index fd85f620..1d50a23f 100644
--- a/README.md
+++ b/README.md
@@ -41,16 +41,18 @@ In the project repository, run:
 python googleimagescraper_cli.py --search_keys cat
 ```
 Optional arguments:
- ```
- --number_of_images: The number of images to download (default: 5)
- --headless: Whether to run in headless mode (default: False)
- --min_resolution: The minimum desired image resolution (default: (0,0))
- --max_resolution: The maximum desired image resolution (default: (10000,10000))
- --max_missed: The maximum number of failed images before exiting (default: 10)
- --number_of_workers: The number of workers to use (default: 1)
- --keep_filenames: Whether to keep original URL image filenames (default: False)
- --searchname: The name of the subfolder in which images will be saved (default: "search")
- ```
+
+```
+--number_of_images: The number of images to download (default: 5)
+--headless: Whether to run in headless mode (default: False)
+--min_resolution: The minimum desired image resolution (default: (0,0))
+--max_resolution: The maximum desired image resolution (default: (10000,10000))
+--max_missed: The maximum number of failed images before exiting (default: 10)
+--number_of_workers: The number of workers to use (default: 1)
+--keep_filenames: Whether to keep original URL image filenames (default: False)
+--searchname: The name of the subfolder in which images will be saved (default: "search")
+```
+
 Otherwise, define your desired parameters in main.py and run through the command line:
 ```
 python main.py

From 39d5f2f74e38bb527d475a51127ac39fde66e36d Mon Sep 17 00:00:00 2001
From: ruggsea <55443134+ruggsea@users.noreply.github.com>
Date: Wed, 10 May 2023 17:13:53 +0200
Subject: [PATCH 5/5] Update scraper to make it compatible with cli

---
 GoogleImageScraper.py | 418 +++++++++++++++++++++---------------------
 1 file changed, 209 insertions(+), 209 deletions(-)

diff --git a/GoogleImageScraper.py b/GoogleImageScraper.py
index 80134a80..1962cb40 100644
--- a/GoogleImageScraper.py
+++ b/GoogleImageScraper.py
@@ -1,209 +1,209 @@
-# -*- coding: utf-8 -*-
-"""
-Created on Sat Jul 18 13:01:02 2020
-
-@author: OHyic
-"""
-#import selenium drivers
-from selenium import webdriver
-from selenium.webdriver.chrome.options import Options
-from selenium.webdriver.common.by import By
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.common.exceptions import NoSuchElementException
-
-#import helper libraries
-import time
-import urllib.request
-from urllib.parse import urlparse
-import os
-import requests
-import io
-from PIL import Image
-import re
-
-#custom patch libraries
-import patch
-
-class GoogleImageScraper():
-    def __init__(self, webdriver_path, image_path, search_key="cat", number_of_images=1, headless=True, min_resolution=(0, 0), max_resolution=(1920, 1080), max_missed=10):
-        #check parameter types
-        image_path = os.path.join(image_path, search_key)
-        if (type(number_of_images)!=int):
-            print("[Error] Number of images must be integer value.")
-            return
-        if not os.path.exists(image_path):
-            print("[INFO] Image path not found. Creating a new folder.")
-            os.makedirs(image_path)
-
-        #check if chromedriver is installed
-        if (not os.path.isfile(webdriver_path)):
-            is_patched = patch.download_lastest_chromedriver()
-            if (not is_patched):
-                exit("[ERR] Please update the chromedriver.exe in the webdriver folder according to your chrome version:https://chromedriver.chromium.org/downloads")
-
-        for i in range(1):
-            try:
-                #try going to www.google.com
-                options = Options()
-                if(headless):
-                    options.add_argument('--headless')
-                driver = webdriver.Chrome(webdriver_path, chrome_options=options)
-                driver.set_window_size(1400,1050)
-                driver.get("https://www.google.com")
-                try:
-                    WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.ID, "W0wltc"))).click()
-                except Exception as e:
-                    continue
-            except Exception as e:
-                #update chromedriver
-                pattern = '(\d+\.\d+\.\d+\.\d+)'
-                version = list(set(re.findall(pattern, str(e))))[0]
-                is_patched = patch.download_lastest_chromedriver(version)
-                if (not is_patched):
-                    exit("[ERR] Please update the chromedriver.exe in the webdriver folder according to your chrome version:https://chromedriver.chromium.org/downloads")
-
-        self.driver = driver
-        self.search_key = search_key
-        self.number_of_images = number_of_images
-        self.webdriver_path = webdriver_path
-        self.image_path = image_path
-        self.url = "https://www.google.com/search?q=%s&source=lnms&tbm=isch&sa=X&ved=2ahUKEwie44_AnqLpAhUhBWMBHUFGD90Q_AUoAXoECBUQAw&biw=1920&bih=947"%(search_key)
-        self.headless=headless
-        self.min_resolution = min_resolution
-        self.max_resolution = max_resolution
-        self.max_missed = max_missed
-
-    def find_image_urls(self):
-        """
-        This function search and return a list of image urls based on the search key.
-        Example:
-            google_image_scraper = GoogleImageScraper("webdriver_path","image_path","search_key",number_of_photos)
-            image_urls = google_image_scraper.find_image_urls()
-
-        """
-        print("[INFO] Gathering image links")
-        self.driver.get(self.url)
-        image_urls=[]
-        count = 0
-        missed_count = 0
-        indx_1 = 0
-        indx_2 = 0
-        search_string = '//*[@id="islrg"]/div[1]/div[%s]/a[1]/div[1]/img'
-        time.sleep(3)
-        while self.number_of_images > count and missed_count < self.max_missed:
-            if indx_2 > 0:
-                try:
-                    imgurl = self.driver.find_element(By.XPATH, search_string%(indx_1,indx_2+1))
-                    imgurl.click()
-                    indx_2 = indx_2 + 1
-                    missed_count = 0
-                except Exception:
-                    try:
-                        imgurl = self.driver.find_element(By.XPATH, search_string%(indx_1+1,1))
-                        imgurl.click()
-                        indx_2 = 1
-                        indx_1 = indx_1 + 1
-                    except:
-                        indx_2 = indx_2 + 1
-                        missed_count = missed_count + 1
-            else:
-                try:
-                    imgurl = self.driver.find_element(By.XPATH, search_string%(indx_1+1))
-                    imgurl.click()
-                    missed_count = 0
-                    indx_1 = indx_1 + 1
-                except Exception:
-                    try:
-                        imgurl = self.driver.find_element(By.XPATH, '//*[@id="islrg"]/div[1]/div[%s]/div[%s]/a[1]/div[1]/img'%(indx_1,indx_2+1))
-                        imgurl.click()
-                        missed_count = 0
-                        indx_2 = indx_2 + 1
-                        search_string = '//*[@id="islrg"]/div[1]/div[%s]/div[%s]/a[1]/div[1]/img'
-                    except Exception:
-                        indx_1 = indx_1 + 1
-                        missed_count = missed_count + 1
-
-            try:
-                #select image from the popup
-                time.sleep(1)
-                class_names = ["n3VNCb","iPVvYb","r48jcc","pT0Scc"]
-                images = [self.driver.find_elements(By.CLASS_NAME, class_name) for class_name in class_names if len(self.driver.find_elements(By.CLASS_NAME, class_name)) != 0 ][0]
-                for image in images:
-                    #only download images that starts with http
-                    src_link = image.get_attribute("src")
-                    if(("http" in src_link) and (not "encrypted" in src_link)):
-                        print(
-                            f"[INFO] {self.search_key} \t #{count} \t {src_link}")
-                        image_urls.append(src_link)
-                        count +=1
-                        break
-            except Exception:
-                print("[INFO] Unable to get link")
-
-            try:
-                #scroll page to load next image
-                if(count%3==0):
-                    self.driver.execute_script("window.scrollTo(0, "+str(indx_1*60)+");")
-                element = self.driver.find_element(By.CLASS_NAME,"mye4qd")
-                element.click()
-                print("[INFO] Loading next page")
-                time.sleep(3)
-            except Exception:
-                time.sleep(1)
-
-
-        self.driver.quit()
-        print("[INFO] Google search ended")
-        return image_urls
-
-    def save_images(self,image_urls, keep_filenames):
-        print(keep_filenames)
-        #save images into file directory
-        """
-        This function takes in an array of image urls and save it into the given image path/directory.
-        Example:
-            google_image_scraper = GoogleImageScraper("webdriver_path","image_path","search_key",number_of_photos)
-            image_urls=["https://example_1.jpg","https://example_2.jpg"]
-            google_image_scraper.save_images(image_urls)
-
-        """
-        print("[INFO] Saving image, please wait...")
-        for indx,image_url in enumerate(image_urls):
-            try:
-                print("[INFO] Image url:%s"%(image_url))
-                search_string = ''.join(e for e in self.search_key if e.isalnum())
-                image = requests.get(image_url,timeout=5)
-                if image.status_code == 200:
-                    with Image.open(io.BytesIO(image.content)) as image_from_web:
-                        try:
-                            if (keep_filenames):
-                                #extact filename without extension from URL
-                                o = urlparse(image_url)
-                                image_url = o.scheme + "://" + o.netloc + o.path
-                                name = os.path.splitext(os.path.basename(image_url))[0]
-                                #join filename and extension
-                                filename = "%s.%s"%(name,image_from_web.format.lower())
-                            else:
-                                filename = "%s%s.%s"%(search_string,str(indx),image_from_web.format.lower())
-
-                            image_path = os.path.join(self.image_path, filename)
-                            print(
-                                f"[INFO] {self.search_key} \t {indx} \t Image saved at: {image_path}")
-                            image_from_web.save(image_path)
-                        except OSError:
-                            rgb_im = image_from_web.convert('RGB')
-                            rgb_im.save(image_path)
-                        image_resolution = image_from_web.size
-                        if image_resolution != None:
-                            if image_resolution[0]<self.min_resolution[0] or image_resolution[1]<self.min_resolution[1] or image_resolution[0]>self.max_resolution[0] or image_resolution[1]>self.max_resolution[1]:
-                                image_from_web.close()
-                                os.remove(image_path)
-
-                        image_from_web.close()
-            except Exception as e:
-                print("[ERROR] Download failed: ",e)
-                pass
-        print("--------------------------------------------------")
-        print("[INFO] Downloads completed. Please note that some photos were not downloaded as they were not in the correct format (e.g. jpg, jpeg, png)")
+# -*- coding: utf-8 -*-
+"""
+Created on Sat Jul 18 13:01:02 2020
+
+@author: OHyic
+"""
+#import selenium drivers
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.common.exceptions import NoSuchElementException
+
+#import helper libraries
+import time
+import urllib.request
+from urllib.parse import urlparse
+import os
+import requests
+import io
+from PIL import Image
+import re
+
+#custom patch libraries
+import patch
+
+class GoogleImageScraper():
+    def __init__(self, webdriver_path, image_path, searchname, search_key="cat", number_of_images=1, headless=True, min_resolution=(0, 0), max_resolution=(1920, 1080), max_missed=10):
+        #check parameter types
+        image_path = os.path.join(image_path, searchname)
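+        # Note: 'searchname' names the output subfolder shared by all worker
+        # threads, while 'search_key' is the individual Google Images query,
+        # so several keys can save into one folder.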
+        if (type(number_of_images)!=int):
+            print("[Error] Number of images must be integer value.")
+            return
+        if not os.path.exists(image_path):
+            print("[INFO] Image path not found. Creating a new folder.")
+            os.makedirs(image_path)
+
+        #check if chromedriver is installed
+        if (not os.path.isfile(webdriver_path)):
+            is_patched = patch.download_lastest_chromedriver()
+            if (not is_patched):
+                exit("[ERR] Please update the chromedriver.exe in the webdriver folder according to your chrome version:https://chromedriver.chromium.org/downloads")
+
+        for i in range(1):
+            try:
+                #try going to www.google.com
+                options = Options()
+                if(headless):
+                    options.add_argument('--headless')
+                driver = webdriver.Chrome(webdriver_path, chrome_options=options)
+                driver.set_window_size(1400,1050)
+                driver.get("https://www.google.com")
+                try:
+                    WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.ID, "W0wltc"))).click()
+                except Exception as e:
+                    continue
+            except Exception as e:
+                #update chromedriver
+                pattern = '(\d+\.\d+\.\d+\.\d+)'
+                version = list(set(re.findall(pattern, str(e))))[0]
+                is_patched = patch.download_lastest_chromedriver(version)
+                if (not is_patched):
+                    exit("[ERR] Please update the chromedriver.exe in the webdriver folder according to your chrome version:https://chromedriver.chromium.org/downloads")
+
+        self.driver = driver
+        self.search_key = search_key
+        self.number_of_images = number_of_images
+        self.webdriver_path = webdriver_path
+        self.image_path = image_path
+        self.url = "https://www.google.com/search?q=%s&source=lnms&tbm=isch&sa=X&ved=2ahUKEwie44_AnqLpAhUhBWMBHUFGD90Q_AUoAXoECBUQAw&biw=1920&bih=947"%(search_key)
+        self.headless=headless
+        self.min_resolution = min_resolution
+        self.max_resolution = max_resolution
+        self.max_missed = max_missed
+
+    def find_image_urls(self):
+        """
+        This function searches for and returns a list of image urls based on the search key.
+        Example:
+            google_image_scraper = GoogleImageScraper("webdriver_path","image_path","searchname","search_key",number_of_photos)
+            image_urls = google_image_scraper.find_image_urls()
+
+        """
+        print("[INFO] Gathering image links")
+        self.driver.get(self.url)
+        image_urls=[]
+        count = 0
+        missed_count = 0
+        indx_1 = 0
+        indx_2 = 0
+        search_string = '//*[@id="islrg"]/div[1]/div[%s]/a[1]/div[1]/img'
+        time.sleep(3)
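+        # The nested try/except walk below reflects an assumption about
+        # Google's current result markup: thumbnails sit either directly at
+        # div[indx_1] or one div deeper once the grid starts nesting sections,
+        # so both XPath shapes are probed before counting a miss.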
+        while self.number_of_images > count and missed_count < self.max_missed:
+            if indx_2 > 0:
+                try:
+                    imgurl = self.driver.find_element(By.XPATH, search_string%(indx_1,indx_2+1))
+                    imgurl.click()
+                    indx_2 = indx_2 + 1
+                    missed_count = 0
+                except Exception:
+                    try:
+                        imgurl = self.driver.find_element(By.XPATH, search_string%(indx_1+1,1))
+                        imgurl.click()
+                        indx_2 = 1
+                        indx_1 = indx_1 + 1
+                    except:
+                        indx_2 = indx_2 + 1
+                        missed_count = missed_count + 1
+            else:
+                try:
+                    imgurl = self.driver.find_element(By.XPATH, search_string%(indx_1+1))
+                    imgurl.click()
+                    missed_count = 0
+                    indx_1 = indx_1 + 1
+                except Exception:
+                    try:
+                        imgurl = self.driver.find_element(By.XPATH, '//*[@id="islrg"]/div[1]/div[%s]/div[%s]/a[1]/div[1]/img'%(indx_1,indx_2+1))
+                        imgurl.click()
+                        missed_count = 0
+                        indx_2 = indx_2 + 1
+                        search_string = '//*[@id="islrg"]/div[1]/div[%s]/div[%s]/a[1]/div[1]/img'
+                    except Exception:
+                        indx_1 = indx_1 + 1
+                        missed_count = missed_count + 1
+
+            try:
+                #select image from the popup
+                time.sleep(1)
+                class_names = ["n3VNCb","iPVvYb","r48jcc","pT0Scc"]
+                images = [self.driver.find_elements(By.CLASS_NAME, class_name) for class_name in class_names if len(self.driver.find_elements(By.CLASS_NAME, class_name)) != 0 ][0]
+                for image in images:
+                    #only download images that starts with http
+                    src_link = image.get_attribute("src")
+                    if(("http" in src_link) and (not "encrypted" in src_link)):
+                        print(
+                            f"[INFO] {self.search_key} \t #{count} \t {src_link}")
+                        image_urls.append(src_link)
+                        count +=1
+                        break
+            except Exception:
+                print("[INFO] Unable to get link")
+
+            try:
+                #scroll page to load next image
+                if(count%3==0):
+                    self.driver.execute_script("window.scrollTo(0, "+str(indx_1*60)+");")
+                element = self.driver.find_element(By.CLASS_NAME,"mye4qd")
+                element.click()
+                print("[INFO] Loading next page")
+                time.sleep(3)
+            except Exception:
+                time.sleep(1)
+
+
+        self.driver.quit()
+        print("[INFO] Google search ended")
+        return image_urls
+
+    def save_images(self,image_urls, keep_filenames):
+        #save images into file directory
+        """
+        This function takes in a list of image urls and saves them into the given image path/directory.
+        Example:
+            google_image_scraper = GoogleImageScraper("webdriver_path","image_path","searchname","search_key",number_of_photos)
+            image_urls=["https://example_1.jpg","https://example_2.jpg"]
+            google_image_scraper.save_images(image_urls, keep_filenames)
+
+        """
+        print("[INFO] Saving image, please wait...")
+        for indx,image_url in enumerate(image_urls):
+            try:
+                print("[INFO] Image url:%s"%(image_url))
+                search_string = ''.join(e for e in self.search_key if e.isalnum())
+                image = requests.get(image_url,timeout=5)
+                if image.status_code == 200:
+                    with Image.open(io.BytesIO(image.content)) as image_from_web:
+                        try:
+                            if (keep_filenames):
+                                #extract filename without extension from URL
+                                o = urlparse(image_url)
+                                image_url = o.scheme + "://" + o.netloc + o.path
+                                name = os.path.splitext(os.path.basename(image_url))[0]
+                                #join filename and extension
+                                filename = "%s.%s"%(name,image_from_web.format.lower())
+                            else:
+                                filename = "%s%s.%s"%(search_string,str(indx),image_from_web.format.lower())
+
+                            image_path = os.path.join(self.image_path, filename)
+                            print(
+                                f"[INFO] {self.search_key} \t {indx} \t Image saved at: {image_path}")
+                            image_from_web.save(image_path)
+                        except OSError:
+                            rgb_im = image_from_web.convert('RGB')
+                            rgb_im.save(image_path)
+                        image_resolution = image_from_web.size
+                        if image_resolution != None:
+                            if image_resolution[0]<self.min_resolution[0] or image_resolution[1]<self.min_resolution[1] or image_resolution[0]>self.max_resolution[0] or image_resolution[1]>self.max_resolution[1]:
+                                image_from_web.close()
+                                os.remove(image_path)
+
+                        image_from_web.close()
+            except Exception as e:
+                print("[ERROR] Download failed: ",e)
+                pass
+        print("--------------------------------------------------")
+        print("[INFO] Downloads completed. Please note that some photos were not downloaded as they were not in the correct format (e.g. jpg, jpeg, png)")