Skip to content

Commit

Permalink
granary: Update scraping logic to handle feed HTML JSON changes
Browse files Browse the repository at this point in the history
  • Loading branch information
snarfed committed Nov 5, 2018
1 parent 45eaba0 commit 2839525
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 39 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,7 @@ Changelog
Add `delete()`. Currently includes Twitter support.
* Instagram:
* Make extra HTTP fetch (with cookie) to get individual likes ([snarfed/bridgy#840](https://github.com/snarfed/bridgy/issues/840)).
* Update scraping logic to handle feed HTML changes.
* Link @-mentions in comments as well as photo/video captions.
* GitHub:
* `create`/`preview_create` bug fixes for issues and comments on private repos.
Expand Down
71 changes: 37 additions & 34 deletions granary/instagram.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,11 @@
r'^/graphql/query/\?query_hash=[^&]*&(amp;)?variables=(%7B%7D|{})$')
# https://github.com/snarfed/bridgy/issues/840
HTML_LIKES_URL = HTML_BASE_URL + 'graphql/query/?query_hash=e0f59e4a1c8d78d0161873bc2ee7ec44&variables={"shortcode":"%s","include_reel":false,"first":24}'
# Matches the inline <script> tag that carries Instagram's embedded JSON blob.
# Two assignment forms are accepted:
#   window._sharedData = {...};
#   window.__additionalDataLoaded('feed', {...});
# Group 1 is the matched prefix variant; group 2 is the (lazily matched) JSON
# payload, so findall() yields (prefix, json) tuples. The trailing `\)?` eats
# the closing paren of the __additionalDataLoaded(...) call when present.
# Compiled with re.VERBOSE, so unescaped whitespace/newlines in the pattern are
# ignored and literal spaces have to be written as `\ `.
# NOTE(review): re.DOTALL is not set, so `.+?` will not span newlines; this
# presumably relies on the JSON being emitted on a single line -- confirm.
HTML_DATA_RE = re.compile(r"""
<script\ type="text/javascript">
window\.(_sharedData\ =|__additionalDataLoaded\('feed',)\ *
(.+?)
\)?;</script>""", re.VERBOSE)

# URL-safe base64 encoding. used in Instagram.id_to_shortcode()
BASE64 = string.ascii_uppercase + string.ascii_lowercase + string.digits + '-_'
Expand Down Expand Up @@ -793,9 +798,8 @@ def html_to_activities(self, html, cookie=None, count=None, fetch_extras=False):
"""
# extract JSON data blob
# (can also get just this JSON by adding ?__a=1 to any IG URL.)
script_start = '<script type="text/javascript">window._sharedData = '
start = html.find(script_start)
if start == -1:
matches = HTML_DATA_RE.findall(html)
if not matches:
# Instagram sometimes returns 200 with incomplete HTML. often it stops at
# the end of one of the <style> tags inside <head>. not sure why.
logging.warning('JSON script tag not found!')
Expand All @@ -812,41 +816,38 @@ def html_to_activities(self, html, cookie=None, count=None, fetch_extras=False):
except ImportError:
json_module = json

start += len(script_start)
end = html.find(';</script>', start)
if end == -1:
# as mentioned above, Instagram sometimes returns 200 with incomplete HTML
logging.warning('JSON script close tag not found!')
return [], None
data = util.trim_nulls(json_module.loads(html[start:end]))

entry_data = data.get('entry_data', {})
activities = []

# find media
medias = []
profile_user = None

# home page ie news feed
for page in entry_data.get('FeedPage', []):
edges = page.get('graphql', {}).get('user', {})\
.get('edge_web_feed_timeline', {}).get('edges', [])
medias.extend(e.get('node') for e in edges
if e.get('node', {}).get('__typename') not in
('GraphSuggestedUserFeedUnit',))

# profiles
for page in entry_data.get('ProfilePage', []):
profile_user = page.get('graphql', {}).get('user', {})
medias.extend(edge['node'] for edge in
profile_user.get('edge_owner_to_timeline_media', {}).get('edges', [])
if edge.get('node'))

# individual photo/video permalinks
for page in entry_data.get('PostPage', []):
media = page.get('graphql', {}).get('shortcode_media')
if media:
medias.append(media)
for match in matches:
data = util.trim_nulls(json_module.loads(match[1]))
entry_data = data.get('entry_data', {})

# home page ie news feed
for page in entry_data.get('FeedPage', []):
edges = page.get('graphql', {}).get('user', {})\
.get('edge_web_feed_timeline', {}).get('edges', [])
medias.extend(e.get('node') for e in edges
if e.get('node', {}).get('__typename') not in
('GraphSuggestedUserFeedUnit',))

if 'user' in data:
edges = data['user'].get('edge_web_feed_timeline', {}).get('edges', [])
medias.extend(e.get('node') for e in edges)

# profiles
for page in entry_data.get('ProfilePage', []):
profile_user = page.get('graphql', {}).get('user', {})
medias.extend(edge['node'] for edge in
profile_user.get('edge_owner_to_timeline_media', {}).get('edges', [])
if edge.get('node'))

# individual photo/video permalinks
for page in entry_data.get('PostPage', []):
media = page.get('graphql', {}).get('shortcode_media')
if media:
medias.append(media)

if not medias:
# As of 2018-02-15, embedded JSON in logged in https://www.instagram.com/
Expand All @@ -862,6 +863,8 @@ def html_to_activities(self, html, cookie=None, count=None, fetch_extras=False):

if count:
medias = medias[:count]

activities = []
for media in util.trim_nulls(medias):
shortcode = media.get('code') or media.get('shortcode')
likes = media.get('edge_media_preview_like') or {}
Expand Down
18 changes: 13 additions & 5 deletions granary/tests/test_instagram.py
Original file line number Diff line number Diff line change
Expand Up @@ -792,13 +792,14 @@ def tag_uri(name):
%s
...
<body>
<script type="text/javascript">window._sharedData = """
HTML_HEADER = HTML_HEADER_TEMPLATE % ''
<script type="text/javascript">%s"""
HTML_HEADER = HTML_HEADER_TEMPLATE % ('', 'window._sharedData = ')
HTML_PRELOAD_URL = '/graphql/query/?query_hash=cba321&variables={}'
HTML_HEADER_PRELOAD = HTML_HEADER_TEMPLATE % (
'<link rel="preload" href="%s" as="fetch" type="application/json" crossorigin />' %
HTML_PRELOAD_URL)
HTML_FOOTER = """
('<link rel="preload" href="%s" as="fetch" type="application/json" crossorigin />' %
HTML_PRELOAD_URL),
'window._sharedData = ')
HTML_FOOTER = """\
;</script>
<script src="//instagramstatic-a.akamaihd.net/h1/bundles/en_US_Commons.js/907dcce6a88a.js" type="text/javascript"></script>
...
Expand Down Expand Up @@ -948,6 +949,10 @@ def tag_uri(name):
HTML_ACTIVITIES_FULL_LIKES = [HTML_PHOTO_ACTIVITY_LIKES, HTML_VIDEO_ACTIVITY_FULL]

HTML_FEED_COMPLETE = HTML_HEADER + json.dumps(HTML_FEED) + HTML_FOOTER

HTML_HEADER_2 = HTML_HEADER_TEMPLATE % ('', "window.__additionalDataLoaded('feed', ")
HTML_FEED_COMPLETE_2 = HTML_HEADER_2 + json.dumps(HTML_PRELOAD_DATA['data']) + ')' + HTML_FOOTER

HTML_PROFILE_COMPLETE = HTML_HEADER + json.dumps(HTML_PROFILE) + HTML_FOOTER
HTML_PROFILE_PRIVATE_COMPLETE = HTML_HEADER + json.dumps(HTML_PROFILE_PRIVATE) + HTML_FOOTER
HTML_PHOTO_COMPLETE = HTML_HEADER + json.dumps(HTML_PHOTO_PAGE) + HTML_FOOTER
Expand Down Expand Up @@ -1536,6 +1541,9 @@ def test_html_to_activities_feed(self):
self.assert_equals(HTML_ACTIVITIES_FULL, activities)
self.assert_equals(HTML_VIEWER, viewer)

activities, viewer = self.instagram.html_to_activities(HTML_FEED_COMPLETE_2)
self.assert_equals(HTML_ACTIVITIES_FULL, activities)

def test_html_to_activities_profile(self):
activities, viewer = self.instagram.html_to_activities(HTML_PROFILE_COMPLETE)
self.assert_equals(HTML_ACTIVITIES, activities)
Expand Down

0 comments on commit 2839525

Please sign in to comment.