# filmesbrasil_cr.py
import time

import bs4 as bs
import jsonpickle
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Selenium 3-style driver setup; on Selenium 4+ the executable path has to be
# wrapped in a Service object instead of being passed positionally.
browser = webdriver.Chrome("C:/Users/renan/Tutorial/chromedriver.exe")
browser.maximize_window()
# Make sure the page is rendered at 100% zoom.
browser.execute_script("document.body.style.zoom='100%'")
list_of_channels = [
    'Canal Brasil',
    'Megapix',
    'Cinemax',
    'Prime Box Brazil',
    'Studio Universal',
    'Paramount',
]
list_of_links = [
    'http://canalbrasil.globo.com/programacao.html',
    'http://megapix.globo.com/',
    'https://br.cinemax.tv/schedule',
    'http://www.boxbrazil.tv.br/prime-box-brazil-grade-de-programacao/',
    'http://studiouniversal.globo.com/programacao.html',
    'https://www.netcombo.com.br/tv-por-assinatura/programacao/canal/paramount-447',
]
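# Only the Canal Brasil entry (index 0) is scraped below; the other channels
# are presumably placeholders for scrapers still to be written.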
# Code for Canal Brasil
# Get the Canal Brasil schedule link.
url = list_of_links[0]
# Open the URL with Selenium: the data I need is rendered through JavaScript,
# so a plain HTTP request would not see it.
browser.get(url)
# Using XPath because it lets me iterate through the items.
class Text():
    """One schedule entry: air time, title, synopsis, rating, year, country."""

    def __init__(self, time, subtitulo, complemento, classificacao, year, origin_country):
        self.time = time
        self.subtitulo = subtitulo
        self.complemento = complemento
        self.classificacao = classificacao
        self.year = year
        self.origin_country = origin_country


def getObjct():
    # NOTE: relies on the module-level 'soup' and 'my_text' defined below.
    # Parse every 'episodio' block in the current soup into a Text object.
    items = soup.find_all('div', class_='episodio')
    for i in items:
        b = Text(i.find('span', class_='hora').text.strip(),
                 i.find('h5', class_='subtitulo').text.strip(),
                 i.find('p', class_='complemento').text.strip(),
                 i.find('span', class_='classificacao').text.strip(),
                 i.find('span', class_='ano').text.strip(),
                 i.find('span', class_='pais').text.strip())
        my_text.append(b)


my_text = []
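# Indices 7-20 appear to correspond to the day tabs of the schedule carousel
# (an assumption read off the XPaths below); tabs past index 10 are off-screen,
# so the carousel has to be advanced with the 'next' arrow before clicking.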
for number in range(7, 21):
    if number < 11:
        # These day tabs are already visible: wait for the tab, then click it.
        xpath = f'//*[@id="area-background"]/main/div/div[2]/div[1]/div/ul/div/div/div[{number}]'
        delay = 3
        try:
            myElem = WebDriverWait(browser, delay).until(
                EC.presence_of_element_located((By.XPATH, xpath)))
            print("Page is ready!")
        except TimeoutException:
            print("Loading took too much time!")
            continue  # skip this day instead of clicking an element that never loaded
        myElem.click()
        time.sleep(2)
        # Get the page source.
        html_source = browser.page_source
        soup = bs.BeautifulSoup(html_source, "lxml")
        # Get the information about the movies for this day.
        getObjct()
    else:
        # Later tabs are off-screen: advance the carousel with the 'next' arrow
        # before clicking the tab itself.
        next_button_xpath = '//span[@class="seta proximo"]'
        browser.find_element(By.XPATH, next_button_xpath).click()
        _xpath = f"//div[@class='owl-wrapper']//div[{number}]"
        time.sleep(2)
        browser.find_element(By.XPATH, _xpath).click()
        # Use the page source with BeautifulSoup just as if it had been
        # fetched through requests.
        html_source = browser.page_source
        soup = bs.BeautifulSoup(html_source, "lxml")
        # Get the information about the movies for this day.
        getObjct()
# Close the browser.
browser.quit()
# Serialize the collected entries; jsonpickle keeps the Text type information.
my_file = jsonpickle.encode(my_text)
print(len(my_text))
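# A minimal sketch of persisting the serialized schedule; the output filename
# 'canal_brasil_schedule.json' is an assumption, not part of the original script.
with open('canal_brasil_schedule.json', 'w', encoding='utf-8') as out:
    out.write(my_file)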
# Debug helper: uncomment to print every collected entry.
'''
for i in my_text:
    print(i.time)
    print(i.subtitulo)
    print(i.complemento)
    print(i.classificacao)
    print(i.year)
    print(i.origin_country)
    print("-------------------------")
# class 'titulo' holds the entry type (a movie, or a programming block such as 'cinemao')
# class 'subtitulo' holds the title
# class 'complemento' holds the synopsis (resumo)
# class 'classificacao' holds the age rating
# class 'ano' holds the year of the movie
# class 'pais' holds the origin country of the movie
'''
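# To load the data back later, jsonpickle can rebuild the Text objects from
# the file written above (sketch; assumes the filename used in the write step):
# with open('canal_brasil_schedule.json', encoding='utf-8') as f:
#     restored = jsonpickle.decode(f.read())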