-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgetMarketInfo.rb
279 lines (236 loc) · 9.66 KB
/
getMarketInfo.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
# **************************************************
# Crawls the Yahoo! Finance market calendar for economic event information
# and updates the local database with the data.
# The importance rating and the number of days to fetch are read from settings.xml.
# **************************************************
# Date prepared : 2014-11-30
# Date updated : 2018-01-16
# Copyright (c) 2014 Shota Taniguchi
# Released under the MIT license
# http://opensource.org/licenses/mit-license.php
# **************************************************
require 'rubygems'
require "date"
require 'digest/sha1'
require 'rexml/document'
require 'timeout'
require "capybara"
require "capybara/dsl"
require "selenium-webdriver"
require 'sqlite3'
require "webdrivers"
# Crawler for the Yahoo! Japan Finance FX market calendar.
#
# Typical usage (see the bottom of this file):
#   crawler = YahooFinance.new
#   crawler.get_crawling_settings   # read ./settings.xml
#   crawler.init_crawling           # open the site and the SQLite database
#   crawler.get_economic_calendar   # scrape events and store them
class YahooFinance
  include Capybara::DSL

  # Database name
  DATABASE_NAME = "marketCal.sqlite3"
  # Target Table name
  DATASTORE_TABLE_NAME = "TB_Ymarket"
  # Number of days crawled when settings.xml gives no usable value.
  DEFAULT_DAYS = "5"
  # Maps the <getImportance> setting to the label shown in the site's
  # importance drop-down. Any other value selects "all".
  IMPORTANCE_LABELS = { "1" => "★", "2" => "★★", "3" => "★★★" }
  ALL_IMPORTANCE_LABEL = "すべて"
  # Index (1-based, XPath) of the table row holding the date header of the
  # requested day; event rows follow it.
  BASE_ROW = 2

  # Reads the number of days and the importance rating from ./settings.xml
  # into @getDays (String) and @getImportance (String).
  #
  # A missing, empty, non-numeric or non-positive <getDays> falls back to
  # DEFAULT_DAYS; a missing or unknown <getImportance> selects all ratings.
  def get_crawling_settings
    # Block form closes the file handle as soon as the document is parsed.
    doc = File.open("./settings.xml") { |file| REXML::Document.new(file) }

    # Get how many days you'd like to get.
    @getDays = setting_text(doc, 'settings/getDays')
    @getDays = DEFAULT_DAYS if @getDays.to_i <= 0
    puts ""
    puts "Crawling Days : " + @getDays

    # Get type of the events.
    @getImportance = IMPORTANCE_LABELS.fetch(setting_text(doc, 'settings/getImportance'), ALL_IMPORTANCE_LABEL)
    puts "Crawling rating : " + @getImportance
  end

  # Opens the target site and the SQLite database, creating the data table
  # when it does not exist yet.
  def init_crawling
    # Open Site
    visit('/')
    puts ""
    puts "SiteOpen"

    # Open DataBase (the database file is created if it does not exist)
    @db = SQLite3::Database.new(DATABASE_NAME)
    # Create the target table if it doesn't exist.
    @db.execute("Create Table If Not Exists #{DATASTORE_TABLE_NAME}(id text primary key, event text, rating integer, date text);")
    puts ""
    puts "Connected to Database."
  end

  # Waits until the page reports no active jQuery requests
  # (use this if the site uses ajax).
  #
  # @param waitSeconds [Numeric] seconds to sleep before polling starts
  # @raise [Timeout::Error] when requests are still active after
  #   Capybara.default_max_wait_time seconds of polling
  def wait_for_ajax(waitSeconds)
    sleep waitSeconds
    # default_wait_time no longer exists in Capybara 3; default_max_wait_time
    # is the setting this script configures.
    Timeout.timeout(Capybara.default_max_wait_time) do
      sleep 1 until page.evaluate_script("jQuery.active") == 0
    end
  end

  # Crawls the calendar for @getDays days starting today and stores every
  # event that has a fixed time in the database.
  #
  # Any StandardError aborts the crawl; its message and backtrace are printed
  # instead of being re-raised. The database handle is always closed.
  def get_economic_calendar
    start_day = Date.today
    @getDays.to_i.times do |elapsed_days_count|
      crawl_day(start_day, start_day + elapsed_days_count)
      puts "----------------------------------------"
    end
  rescue => ex
    puts ex.message
    puts ex.backtrace
  ensure
    # @db is nil when init_crawling was never run (or failed early).
    @db.close if @db
  end

  private

  # Returns the stripped text of the element at +path+, or "" when the
  # element or its text is missing.
  def setting_text(doc, path)
    node = doc.elements[path]
    (node && node.text).to_s.strip
  end

  # XPath of the +row_number+-th row of the calendar table.
  def row_xpath(row_number)
    %Q|//*[@id="main"]/div[3]/table/tbody/tr[#{row_number}]|
  end

  # Submits the search form for +crawling_target_day+ and walks the result
  # rows of that day, storing each event.
  def crawl_day(start_day, crawling_target_day)
    date_input = "#{crawling_target_day.year}/#{crawling_target_day.month}/#{crawling_target_day.day}"
    date_hidden = crawling_target_day.strftime("%Y%m%d")

    # Set the day to the visible date picker text box ...
    find(".datepicker").set(date_input)
    # ... and to the hidden element (by JavaScript). Quoted so it is assigned
    # as a string literal.
    execute_script("document.getElementById('ymd').value = '#{date_hidden}'")
    # Select all countries.
    find("#country").select("すべて")
    # Select the importance in the element named 'i'.
    select @getImportance, :from => 'i'
    click_button("selectBtn")
    puts "----------------------------------------"

    # The first row of the result holds the date ("month/day") of the day.
    event_date = ''
    within(:xpath, row_xpath(BASE_ROW)) do
      event_date = all('th')[0].text.match(/\d+\/\d+/).to_s
      puts "getDay:" + event_date
    end

    # Walk the following rows until the day is exhausted.
    row_number = BASE_ROW + 1
    loop do
      more_rows = false
      within(:xpath, row_xpath(row_number)) do
        more_rows = register_event_row(start_day, event_date)
      end
      break unless more_rows
      row_number += 1
    end
  end

  # Handles the table row that is the current Capybara scope.
  #
  # @return [Boolean] true when the row was an event row (keep reading),
  #   false when the row marks "nothing announced" or the next day.
  def register_event_row(start_day, event_date)
    if has_css?(".yjMS")
      # no publication day
      puts "発表なし"
      return false
    end
    # reached the next day
    return false if has_css?(".date")

    # get the event's time and content
    cells = all('td')
    event_time = cells[0].text
    event_content = cells[1].text
    puts "Time:" + event_time
    puts "Event:" + event_content

    # get the event's rating (highest icon class present wins, 0 if none)
    event_rate = [3, 2, 1].find { |level| has_css?(".icoRating#{level}") } || 0
    puts "Rating:" + event_rate.to_s

    date_register = event_base_date(start_day, event_date)

    # Events whose time is undetermined are not registered.
    return true if event_time == "未定"

    event_hour = event_time.match(/\d+/).to_s
    event_minutes = event_time.match(/\:(\d+)/)[1].to_s
    # Hours of 24 and above belong to the following calendar day.
    if event_hour.to_i >= 24
      date_register += 1
      event_hour = event_hour.to_i - 24
    end

    # SQLite has no date type; dates are stored as text.
    # NOTE(review): the hour is not zero-padded (e.g. "1:30:00" after a 25:30
    # rollover), which SQLite's date functions do not parse. Left unchanged
    # because this string feeds the SHA1 key of rows already stored.
    defined_format_event_date = "#{date_register.strftime("%Y-%m-%d")} #{event_hour}:#{event_minutes}:00"
    puts defined_format_event_date

    store_event(defined_format_event_date, event_content, event_rate)
    true
  end

  # Builds the Date of an event from the "month/day" text shown on the page.
  # The page gives no year: a January date seen while crawling from December
  # is placed in the following year.
  def event_base_date(start_day, event_date)
    parts = event_date.match(/(\d+)\/(\d+)/)
    raise ArgumentError, "unexpected event date: #{event_date.inspect}" unless parts

    event_month = parts[1].to_i
    event_day = parts[2].to_i
    year = start_day.year
    year += 1 if start_day.month == 12 && event_month == 1
    Date.new(year, event_month, event_day)
  end

  # Inserts the event unless a row with the same key already exists.
  # The key is the SHA1 of date + content + rating.
  def store_event(event_datetime, event_content, event_rate)
    unique_event_value = Digest::SHA1.hexdigest(event_datetime + event_content + event_rate.to_s)
    puts "SHA1:" + unique_event_value

    # Bound parameters: the event text comes from the page and may contain
    # quotes that would break (or inject into) an interpolated statement.
    existing = @db.execute("Select Count(0) from #{DATASTORE_TABLE_NAME} Where id = ?", [unique_event_value])
    if existing.first.first.to_i == 0
      @db.execute("Insert into #{DATASTORE_TABLE_NAME} Values(?, ?, ?, ?);",
                  [unique_event_value, event_content, event_rate, event_datetime])
      puts "Inserted to DB."
    else
      puts "Already registered."
    end
    puts "********************"
  end
end
# ---- Capybara setup and script entry point ----

# Headless Chrome driver with a custom user agent.
Capybara.register_driver :selenium_chrome_headless do |app|
  Capybara::Selenium::Driver.load_selenium
  user_agent = "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)"
  chrome_options = ::Selenium::WebDriver::Chrome::Options.new
  chrome_options.args << '--headless'
  chrome_options.args << '--disable-gpu' if Gem.win_platform?
  chrome_options.args << "--user-agent=#{user_agent}"
  # Workaround https://bugs.chromium.org/p/chromedriver/issues/detail?id=2650&q=load&sort=-id&colspec=ID%20Status%20Pri%20Owner%20Summary
  chrome_options.args << '--disable-site-isolation-trials'
  Capybara::Selenium::Driver.new(app, browser: :chrome, options: chrome_options)
end

# No Rack application is involved, so Capybara must not start its own server.
Capybara.run_server = false
# Use the headless Chrome driver for regular and JavaScript sessions alike.
Capybara.current_driver = :selenium_chrome_headless
Capybara.javascript_driver = :selenium_chrome_headless
# Base URL of the target site; visit('/') is resolved against it.
Capybara.app_host = "https://info.finance.yahoo.co.jp/fx/marketcalendar/"
# Maximum time (seconds) Capybara waits for elements to appear.
Capybara.default_max_wait_time = 5
# Finders ignore elements that are not visible.
Capybara.ignore_hidden_elements = true

puts "Started Crawling."
# Create the crawler, load the settings, open the site and the database,
# then fetch the events and register them in the database.
crawler = YahooFinance.new
crawler.get_crawling_settings
crawler.init_crawling
crawler.get_economic_calendar
puts "Finished Crawling."