diff --git a/.vscode/tags b/.vscode/tags index f0d7ffc..b4db7d7 100644 --- a/.vscode/tags +++ b/.vscode/tags @@ -10,15 +10,43 @@ active_user ../crawler.py /^ active_user = ""$/;" kind:variable line:20 active_user ../crawler.py /^ active_user = sys.argv[1]$/;" kind:variable line:14 active_user ../crawler.py /^active_user = "twittersoz"$/;" kind:variable line:12 actual_crawl ../crawler.py /^actual_crawl = 0$/;" kind:variable line:8 +check_existed_link_id ../get_data_from_image.py /^def check_existed_link_id(link_id):$/;" kind:function line:7 +check_existed_user ../export_links_db.py /^def check_existed_user(username):$/;" kind:function line:17 +concepts ../get_data_from_image.py /^ concepts = get_API(row[2])$/;" kind:variable line:40 crawler.py ../crawler.py 1;" kind:file line:1 -driver ../get_data.py /^driver = webdriver.Firefox()$/;" kind:variable line:5 +cursor ../export_links_db.py /^cursor = db.cursor()$/;" kind:variable line:27 +cursor ../export_users_db.py /^ cursor = db.cursor()$/;" kind:variable line:11 +cursor ../export_users_db.py /^cursor = db.cursor()$/;" kind:variable line:17 +cursor ../get_data_from_image.py /^cursor = db.cursor()$/;" kind:variable line:28 +cursor ../show_users.py /^cursor = db.cursor()$/;" kind:variable line:5 +cursor2 ../get_data_from_image.py /^ cursor2 = db.cursor()$/;" kind:variable line:41 +db ../export_links_db.py /^db = sqlite3.connect('web\/data\/crawler')$/;" kind:variable line:4 +db ../export_users_db.py /^db = sqlite3.connect('web\/data\/crawler')$/;" kind:variable line:7 +db ../get_data_from_image.py /^db = sqlite3.connect('.\/web\/data\/crawler')$/;" kind:variable line:5 +db ../show_users.py /^db = sqlite3.connect('.\/web\/data\/crawler')$/;" kind:variable line:3 e ../crawler.py /^e = Crawler(active_user)$/;" kind:variable line:89 -get_data.py ../get_data.py 1;" kind:file line:1 +export_links_db.py ../export_links_db.py 1;" kind:file line:1 +export_users_db.py ../export_users_db.py 1;" kind:file line:1 +file_exist 
../export_users_db.py /^file_exist = os.path.isfile('.\/web\/data\/crawler') $/;" kind:variable line:5 +file_links ../get_images_from_profile.py /^ file_links = open("links_" + userbo + ".txt","w") $/;" kind:variable line:29 +get_API ../get_data_from_image.py /^def get_API(photo_url):$/;" kind:function line:15 +get_data_from_image.py ../get_data_from_image.py 1;" kind:file line:1 +get_images_from_profile.py ../get_images_from_profile.py 1;" kind:file line:1 +get_img_url ../get_images_from_profile.py /^def get_img_url(url):$/;" kind:function line:5 +get_links ../export_links_db.py /^def get_links(username):$/;" kind:function line:7 join ../crawler.py /^ def join( self ):$/;" kind:member line:81 +links ../get_images_from_profile.py /^ links = get_img_url(userbo)$/;" kind:variable line:30 +llistat ../export_links_db.py /^ llistat = get_links(userbo)$/;" kind:variable line:34 max_crawl ../crawler.py /^max_crawl = 5$/;" kind:variable line:7 parse_users ../crawler.py /^ def parse_users(self, soup):$/;" kind:member line:38 +photo_id ../get_data_from_image.py /^ photo_id = str(row[0])$/;" kind:variable line:32 run ../crawler.py /^ def run (self):$/;" kind:member line:27 scanned_users ../crawler.py /^ scanned_users = []$/;" kind:variable line:19 -soup ../get_data.py /^soup = BeautifulSoup(driver.page_source)$/;" kind:variable line:8 -url ../get_data.py /^url = 'http:\/\/instagram.com\/umnpics\/'$/;" kind:variable line:4 +show_users.py ../show_users.py 1;" kind:file line:1 +user ../export_links_db.py /^ user = filename.split("links_")[1]$/;" kind:variable line:31 +user ../export_users_db.py /^ user = filename.split("users_")[1]$/;" kind:variable line:21 +user ../get_images_from_profile.py /^ user = filename.split("users_")[1]$/;" kind:variable line:27 +userbo ../export_links_db.py /^ userbo = user.split(".txt")[0]$/;" kind:variable line:32 +userbo ../export_users_db.py /^ userbo = user.split(".txt")[0]$/;" kind:variable line:22 +userbo ../get_images_from_profile.py /^ userbo 
= user.split(".txt")[0]$/;" kind:variable line:28 users ../crawler.py /^ users = []$/;" kind:variable line:18 diff --git a/crawler.py b/crawler.py index 19ccd9a..891ee0a 100644 --- a/crawler.py +++ b/crawler.py @@ -34,6 +34,15 @@ def run (self): # Parse users self.parse_users(soup) + def check_exist(self, username): + file_users_analized = open("analized_users.txt","r") + loglist = file_users_analized.readlines() + file_users_analized.close() + found = False + for line in loglist: + if str(username) in line: + found = True + return found def parse_users(self, soup): @@ -60,7 +69,7 @@ def parse_users(self, soup): #print element file_users.close() #Add user to users analized - file_users_analized = open("analized_users.txt","w+") + file_users_analized = open("analized_users.txt","a") file_users_analized.write(self.active_user + "\n") file_users_analized.close() actual_crawl -= 1 @@ -73,8 +82,9 @@ def parse_users(self, soup): time.sleep(5) else: print "worker " + usuari - actual_crawl += 1 - self.child = subprocess.Popen([sys.executable, './crawler.py', usuari]) + if ( self.check_exist(usuari) == False): + actual_crawl += 1 + self.child = subprocess.Popen([sys.executable, './crawler.py', usuari])