b2.py
__author__ = 'arashiyama'

import json
import logging
import urllib2

from dateutil.parser import parse
from sqlalchemy import create_engine, String, Integer, Column
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
############################## GLOBALS ########################################
STREAM_HASH = '' # insert your datasift stream hash here
Base = declarative_base()
############################## CLASSES ########################################
class Bucket(Base):
    __tablename__ = 'sandpit'
    id = Column(Integer, primary_key=True)
    handle = Column(String(50))
    counter = Column(Integer)

    def __init__(self, handle, counter):
        self.handle = handle
        self.counter = counter

    def __repr__(self):
        return "<Bucket('%s', '%d')>" % (self.handle, self.counter)
####################################### helpers ##############################
def get_bucket_name(twee, dtstamp):
    """Build a per-minute bucket name from a prefix and a timestamp string."""
    dt = parse(dtstamp)
    return "bucket-%s-%s" % (twee, dt.strftime('%Y%b%d%H%M'))
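# Illustrative example (assuming a Twitter-style timestamp string as input):
#   get_bucket_name("TWT", "Tue, 05 Mar 2013 14:02:11 +0000")
# should yield something like "bucket-TWT-2013Mar051402", i.e. one bucket per minute.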
"""
# How to use:
complicated_dict = {"dict1key": {"dict2key": [{"dict3key": {"tell":"me"}}]}}
one_level_dict = makeSimpleDict(complicated_dict)
print one_level_dict
# prints out {'dict1key.dict2key.[0].dict3key.tell': u'me'}
"""
def makeSimpleDict(complex_dict):
    """
    @param complex_dict : a python dict that might contain nested dicts and lists, OR
                          a python list that might contain nested dicts and lists, OR
                          a python object that is neither a list nor a dict
    @return plain_dict  : a dict with a single level of dotted keys, or a plain python object
    """
    # the flattened dict that will be returned
    plain_dict = {}
    if isinstance(complex_dict, dict):
        # loop through the keys of this complex dict
        sawComplex = False
        for complex_key in complex_dict:
            if isinstance(complex_dict[complex_key], dict):
                # nested dict: prefix each inner key with the parent key
                sub_dict = complex_dict[complex_key]
                sawComplex = True
                for sub_key in sub_dict:
                    plain_dict[complex_key + "." + sub_key] = makeSimpleDict(sub_dict[sub_key])
            elif isinstance(complex_dict[complex_key], list):
                if not isComplexList(complex_dict[complex_key]):
                    # a list of plain values: collapse it into a single string
                    plain_dict[complex_key] = getStrFromList(complex_dict[complex_key])
                else:
                    # a list containing dicts or lists: index each element into the key
                    sawComplex = True
                    for i in range(len(complex_dict[complex_key])):
                        plain_dict[complex_key + "[" + str(i) + "]"] = makeSimpleDict(complex_dict[complex_key][i])
            else:
                plain_dict[complex_key] = makeSimpleDict(complex_dict[complex_key])
        if sawComplex:
            # some values were still nested after this pass, so flatten again
            return makeSimpleDict(plain_dict)
        else:
            return plain_dict
    else:
        # not a dict
        if isinstance(complex_dict, list):
            # does the list contain a dict or an inner list?
            if not isComplexList(complex_dict):
                # a list of plain values: collapse it into a single string
                plain_dict = getStrFromList(complex_dict)
            else:
                # index each element of the list into its own key
                for i in range(len(complex_dict)):
                    plain_dict["[" + str(i) + "]"] = makeSimpleDict(complex_dict[i])
            return makeSimpleDict(plain_dict)
        else:
            # neither a list nor a dict: return it as text
            return unicode(complex_dict)
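# A further worked case, following from the code above: a list of plain values is
# collapsed into one comma-separated string, e.g.
#   makeSimpleDict({"tags": ["a", "b"]})  ->  {"tags": u"a, b"}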
def isComplexList(ls):
    """Return True if the list contains any dict or nested list."""
    for each in ls:
        if isinstance(each, (dict, list)):
            return True
    return False
def getStrFromList(ls):
    """Join a list of plain values into a single comma-separated string."""
    if isinstance(ls, list):
        return ", ".join([unicode(each) for each in ls])
    else:
        return ls
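# Expected behaviour of the two helpers above, for illustration:
#   isComplexList([1, 2, 3])         -> False
#   isComplexList([1, {"a": 2}])     -> True
#   getStrFromList(["red", "blue"])  -> u"red, blue"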
def fill_a_bucket(bucket_name):
    # Check whether the bucket exists; if it does, increment its counter.
    row = session.query(Bucket).filter(Bucket.handle == bucket_name)
    matches = row.all()
    if len(matches) == 0:
        # no bucket exists, so create it with a counter of 1
        red_bucket = Bucket(bucket_name, 1)
        session.add(red_bucket)
        print("Created a new red bucket with name %s" % bucket_name)
    elif len(matches) == 1:
        # we have an existing bucket, so increment its counter in place
        row.update({Bucket.counter: Bucket.counter + 1})
        print("I think I added 1 to bucket %s" % bucket_name)
    else:
        # This is an error as we have more than one bucket with the same name. Karmically bad!
        logging.error("More than one bucket found with bucket_name set to %s" % bucket_name)
    # Commit to the database, feel good about ourselves for a microsecond, then get the next one
    session.commit()
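# Illustrative standalone use (assumes the module-level `session` created in the
# __main__ block below already exists):
#   fill_a_bucket("bucket-TWT-2013Mar051402")  # first call creates the row with counter=1
#   fill_a_bucket("bucket-TWT-2013Mar051402")  # second call bumps the counter to 2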
############################### MAIN ########################################
def DSLoop(hash=STREAM_HASH):
    """Connect to the DataSift stream and bucket each interaction by the minute it was created."""
    ds_url = r'https://stream.datasift.com/' + hash
    ds_data = r''  # Insert your authorisation keys
    request = urllib2.Request(ds_url, headers={'Authorization': ds_data})
    for l in urllib2.urlopen(request):
        the_data = makeSimpleDict(json.loads(l))
        logging.debug("number of twitter fields received: %d" % len(the_data.keys()))
        for i in sorted(the_data.keys()):
            if r'twitter.created_at' in i:
                bucket_name = get_bucket_name("TWT", the_data[i])
                fill_a_bucket(bucket_name)
            if r'retweet.created_at' in i:
                bucket_name = get_bucket_name("RT", the_data[i])
                fill_a_bucket(bucket_name)
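# Note: each line read from the stream is one JSON interaction; after makeSimpleDict
# it becomes a flat dict, and the 'twitter.created_at' / 'retweet.created_at' keys
# are what the loop above uses to decide which bucket to fill.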
if __name__ == "__main__":
    # set up the database (session is module-level so fill_a_bucket can see it)
    db = create_engine('sqlite:///DSdata.sqlite')
    Session = sessionmaker(bind=db)
    session = Session()
    Base.metadata.create_all(db)
    # start consuming the stream
    DSLoop(STREAM_HASH)
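# To inspect the results afterwards, one could (for example) reopen the same
# database and list the buckets:
#   for b in session.query(Bucket).order_by(Bucket.handle):
#       print("%s: %d" % (b.handle, b.counter))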