-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcrawling.py
95 lines (67 loc) · 3.5 KB
/
crawling.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import requests
from bs4 import BeautifulSoup
import re
# Request headers sent with every fetch in this module: a desktop Chrome
# User-Agent string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/73.0.3683.86 Safari/537.36'
    ),
}
def no_space(text):
    """Return *text* with scraped layout whitespace removed.

    Two regex deletions are applied in order: first every match of the
    whitespace alternation (space characters, tab, carriage return), then
    every pair of consecutive newlines. The result is finally stripped of
    leading and trailing whitespace.
    """
    # Order matters: blank lines are only collapsed after the first pass
    # has removed the characters that may sit between two newlines.
    for pattern in (' | | \n|\t|\r', '\n\n'):
        text = re.sub(pattern, '', text)
    return text.strip()
def get_movie_info():
    """Scrape the Naver Movie "currently running" listing.

    Fetches ``https://movie.naver.com/movie/running/current.nhn`` and builds
    one dict per ``<li>`` entry of the listing.

    Returns:
        list[dict]: One dict per movie, in page order. Every dict has
        ``code`` (1-based position in the listing), ``title``, ``img``
        (poster URL without its query string), ``link`` (absolute detail
        page URL) and ``id`` (int parsed from the link's query string).
        ``reserve``, ``age``, ``genre``, ``show_time``, ``opening_date``,
        ``director`` and ``actor`` are present only when the page provides
        them.

    Raises:
        requests.exceptions.RequestException: If the listing page cannot be
            fetched (including when the request times out).
    """
    data = requests.get(
        'https://movie.naver.com/movie/running/current.nhn',
        headers=headers,
        timeout=10,  # without a timeout an unresponsive server blocks forever
    )
    soup = BeautifulSoup(data.text, 'html.parser')
    movies = soup.select('#content > div.article > div:nth-child(1) > div.lst_wrap > ul > li')

    movie_list = []
    for idx, movie in enumerate(movies):
        title = movie.select_one('dl > dt > a')
        reserve = movie.select_one('dl > dd.info_t1 > div > a')
        age = movie.select_one('dl > dt > span')

        # The detail block and its children are optional: select_one returns
        # None when an element is missing, so guard every dereference.
        detail = movie.select_one('dl > dd:nth-child(3) > dl')
        summary = detail.select_one('dd:nth-child(2)') if detail is not None else None
        genre = summary.select_one('span.link_txt') if summary is not None else None
        director = detail.select_one('dd:nth-child(4) > span > a') if detail is not None else None
        actor = detail.select_one('dd:nth-child(6) > span') if detail is not None else None

        # The summary line is '|'-separated (genre | running time | release date).
        if summary is not None:
            summary_parts = [no_space(part) for part in summary.text.split('|')]
        else:
            summary_parts = []
        # Running time ends with '분' (minutes) and the release date ends with
        # '개봉' (release). Either may be absent, so take the first match or
        # None instead of unpacking a list that must have exactly one item.
        show_time = next((part for part in summary_parts if re.search('분$', part)), None)
        opening_date = next((part for part in summary_parts if re.search('개봉$', part)), None)

        movie_info = {}
        movie_info['code'] = idx + 1
        movie_info['title'] = title.text
        # Drop the query string (image sizing parameters) from the poster URL.
        movie_info['img'] = movie.select_one('div > a > img')['src'].split('?')[0]
        movie_info['link'] = 'https://movie.naver.com' + title['href']
        # The numeric id is the value of the first query parameter of the link.
        movie_info['id'] = int(movie_info['link'].split('?')[1].split('=')[1])
        if reserve is not None:
            movie_info['reserve'] = 'https://movie.naver.com' + reserve['href']
        if age is not None:
            movie_info['age'] = age.text
        if genre is not None:
            movie_info['genre'] = no_space(genre.text)
        if show_time:
            movie_info['show_time'] = show_time
        if opening_date:
            movie_info['opening_date'] = opening_date
        if director is not None:
            movie_info['director'] = no_space(director.text)
        if actor is not None:
            movie_info['actor'] = no_space(actor.text)
        movie_list.append(movie_info)
    return movie_list
# Crawl the listing once at import time and index the results by movie id so
# that get_movie_summary() can look a movie up directly. If two entries share
# an id, the later one wins.
movies = get_movie_info()
movie_list = {movie['id']: movie for movie in movies}
def get_movie_summary(id):
    """Return the listing data of one movie plus its synopsis.

    Looks the movie up in the module-level ``movie_list`` index, fetches its
    detail page and reads the synopsis heading and text from the story area.

    Args:
        id: Key of the movie in ``movie_list`` (the int id produced by
            ``get_movie_info``).

    Returns:
        dict: ``title``, ``img``, ``link`` and ``id`` are always present.
        Note that ``id`` here is the *string* taken from the link's query
        string, not the int used as the ``movie_list`` key. ``opening_date``,
        ``genre``, ``show_time``, ``director``, ``actor``, ``age`` and
        ``reserve`` are copied when the listing entry has them, and
        ``summary_tit`` / ``summary_des`` are added when the detail page
        has a synopsis.

    Raises:
        KeyError: If ``id`` is not in ``movie_list``.
        requests.exceptions.RequestException: If the detail page cannot be
            fetched (including when the request times out).
    """
    target = movie_list[id]
    data = requests.get(
        target['link'],
        headers=headers,
        timeout=10,  # without a timeout an unresponsive server blocks forever
    )
    soup = BeautifulSoup(data.text, 'html.parser')

    # Not every detail page has a story area; select_one returns None then,
    # and the synopsis fields are simply left out of the result.
    story = soup.select_one('#content > div.article > div.section_group.section_group_frst > div:nth-child(1) > div > div.story_area')
    summary_tit = story.select_one('h5') if story is not None else None
    summary_des = story.select_one('p') if story is not None else None

    detail_info = {
        'title': target['title'],
        'img': target['img'],
        'link': target['link'],
        'id': target['link'].split('?')[1].split('=')[1],
    }
    # Optional listing fields, copied in a fixed order when present.
    optional_keys = (
        'opening_date', 'genre', 'show_time', 'director',
        'actor', 'age', 'reserve',
    )
    for key in optional_keys:
        if key in target:
            detail_info[key] = target[key]
    if summary_tit is not None:
        detail_info['summary_tit'] = summary_tit.text
    if summary_des is not None:
        detail_info['summary_des'] = summary_des.text
    return detail_info