-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
ce5ebe6
commit 7f6cb72
Showing
65 changed files
with
635,011 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
import sqlite3 | ||
import nltk | ||
import string | ||
|
||
# Report the average number of words per comment for commenters whose
# recorded gender is 'male' or 'female'.
db = sqlite3.connect('../commentsData.db')
c = db.cursor()

# The gender filter is a single IN (...) term.  An unparenthesised
# "join-conditions AND gender='male' OR gender='female'" would let the OR
# branch bypass the join conditions, because AND binds tighter than OR.
c.execute("select c.commentBody, g.gender from comments c, commenterGender g "
          "where c.userID = g.userID and c.username = g.username "
          "and g.gender in ('male', 'female')")

_PUNCTUATION = frozenset(string.punctuation)


def _isWord(token):
    """Return False for empty tokens and tokens made only of punctuation."""
    return not all(ch in _PUNCTUATION for ch in token)


def _average(total, count):
    """Return total/count as a float, or 0.0 when count is zero."""
    return float(total) / count if count else 0.0


totalWordsinMale = 0
commentsByMale = 0

totalWordsinFeMale = 0
commentsByFeMale = 0
progress = 0

# Iterate the cursor directly so the whole result set is never held in memory.
for commentBody, gender in c:
    progress += 1
    wordCount = sum(1 for w in nltk.word_tokenize(commentBody or '') if _isWord(w))

    if gender == 'male':
        commentsByMale += 1
        totalWordsinMale += wordCount
    elif gender == 'female':
        commentsByFeMale += 1
        totalWordsinFeMale += wordCount

    if progress % 50000 == 0:
        print("%d  Comments processed" % progress)

print("total words by male %d  total comments by male %d"
      % (totalWordsinMale, commentsByMale))
print("total words by female %d  total comments by female %d"
      % (totalWordsinFeMale, commentsByFeMale))

# float() keeps the fractional part under Python 2's integer division.
print("male avg words per comment %s" % _average(totalWordsinMale, commentsByMale))
print("female avg words per comment %s" % _average(totalWordsinFeMale, commentsByFeMale))

db.close()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
#!/usr/bin/env python2 | ||
# -*- coding: utf-8 -*- | ||
""" | ||
Created on Thu Mar 8 20:45:43 2018 | ||
@author: Ashwin | ||
""" | ||
import simplejson as json | ||
import pickle | ||
import datetime | ||
import sqlite3 | ||
import traceback | ||
import dateutil.parser as dp | ||
|
||
# Path of the article dump that parseFile() reads line by line.
inputFile = "../../Project/articleJSON.txt"

# Module-wide connection and cursor used by the functions defined below.
db = sqlite3.connect('../commentdata.db')
c = db.cursor()

# One row per (article id, keyword); the composite primary key rejects repeats.
c.execute('CREATE TABLE IF NOT EXISTS ArticleKeywords(id text,keyword text,PRIMARY KEY(id,keyword) ) ')
|
||
|
||
def writeArticleInDB(article):
    """Insert one (article id, keyword value) row per keyword of ``article``.

    ``article`` is a mapping with an ``'_id'`` entry and a ``'keywords'``
    list whose items each carry a ``'value'`` entry.  A falsy ``article``
    (``None`` or an empty mapping) is ignored.  Rows are written through the
    module-level cursor ``c``; committing is left to the caller.

    Raises:
        KeyError: if a non-empty article lacks ``'_id'``/``'keywords'`` or a
            keyword lacks ``'value'``.
    """
    # Guard first: indexing a falsy article would raise before any check ran.
    if not article:
        return

    ID = article['_id']
    seen = set()
    records = []
    for k in article['keywords']:
        value = k['value']
        # A value repeated within one article would violate the
        # (id, keyword) primary key and abort the batch part-way through.
        if value not in seen:
            seen.add(value)
            records.append((ID, value))

    if records:
        c.executemany("INSERT INTO ArticleKeywords VALUES(?,?)", records)
|
||
def parseFile():
    """Load every article found in ``inputFile`` into the database.

    Each line of ``inputFile`` is parsed as one JSON document and every entry
    of its ``['response']['docs']`` list is handed to ``writeArticleInDB``.
    Lines without a ``'response'`` key are skipped silently.  Any other
    failure is appended, with its traceback, to the file ``log`` and parsing
    continues with the next line.  The transaction is committed once at the
    end.
    """
    noOfLinesParsed = 0
    with open(inputFile) as f:
        for line in f:
            try:
                noOfLinesParsed += 1
                doc = json.loads(line)

                for record in doc['response']['docs']:
                    writeArticleInDB(record)

                if noOfLinesParsed % 1000 == 0:
                    print("No of lines Parsed :  %d" % noOfLinesParsed)
            except Exception as e:
                # Inspect the exception type and args instead of the
                # deprecated ``e.message`` attribute.
                missingResponse = isinstance(e, KeyError) and e.args == ('response',)
                if not missingResponse:
                    with open("log", "a") as log:
                        log.write("Error on line " + str(noOfLinesParsed) + "\n")
                        log.write(traceback.format_exc())

    db.commit()


if __name__ == "__main__":
    try:
        parseFile()
    finally:
        # Release the connection even if parsing aborts unexpectedly.
        db.close()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
import os | ||
|
||
# Average every numeric column over all tab-separated result files found in
# `base` and print one "<column name> \t <mean>" line per column.
base = '/home/ashwin/Downloads/Results/male/'
indexes = []   # column names, taken from the first header line encountered
res = []       # res[0] is the group label; res[i] is the running sum of column i
total = 0      # number of data rows accumulated across all files

for filename in os.listdir(base):
    with open(base + filename, "r") as f:
        headerLine = f.readline()
        if not headerLine:
            continue  # empty file: neither header nor data

        if not indexes:
            indexes = headerLine.rstrip().split("\t")
            # Size the accumulator from the header, not a fixed column count.
            res = ["male"] + [0.0] * (len(indexes) - 1)

        for line in f:
            line = line.rstrip()
            if not line:
                continue  # a blank line is not a data row and must not be counted
            total += 1
            values = line.split("\t")
            # Column 0 is never summed; a row wider than the header fails loudly.
            for i in range(1, len(values)):
                res[i] += float(values[i])

if indexes:
    print("%s \t %s" % (indexes[0], res[0]))
    for i in range(1, len(indexes)):
        # With no data rows there is nothing to average; report 0.0.
        mean = res[i] / total if total else 0.0
        print("%s \t %s" % (indexes[i], mean))
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
import os | ||
|
||
# Average every numeric column over all tab-separated result files found in
# `base` and print one "<column name> \t <mean>" line per column.
base = '/home/ashwin/Downloads/Results/female/'
indexes = []   # column names, taken from the first header line encountered
res = []       # res[0] is the group label; res[i] is the running sum of column i
total = 0      # number of data rows accumulated across all files

for filename in os.listdir(base):
    with open(base + filename, "r") as f:
        headerLine = f.readline()
        if not headerLine:
            continue  # empty file: neither header nor data

        if not indexes:
            indexes = headerLine.rstrip().split("\t")
            # Size the accumulator from the header, not a fixed column count.
            res = ["female"] + [0.0] * (len(indexes) - 1)

        for line in f:
            line = line.rstrip()
            if not line:
                continue  # a blank line is not a data row and must not be counted
            total += 1
            values = line.split("\t")
            # Column 0 is never summed; a row wider than the header fails loudly.
            for i in range(1, len(values)):
                res[i] += float(values[i])

if indexes:
    print("%s \t %s" % (indexes[0], res[0]))
    for i in range(1, len(indexes)):
        # With no data rows there is nothing to average; report 0.0.
        mean = res[i] / total if total else 0.0
        print("%s \t %s" % (indexes[i], mean))
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
import sqlite3 | ||
|
||
# Connection and cursor used by the location update further down this script.
db = sqlite3.connect('../../commentsData.db')
c = db.cursor()
|
||
# Region name -> member states/territories.  Kept grouped by region so each
# group can be checked at a glance; flattened into stateToRegion below.
# NOTE(review): the grouping looks like the U.S. Census Bureau divisions - confirm.
_STATES_BY_REGION = (
    ("Pacific", (
        'Washington', 'Oregon', 'California', 'Hawaii', 'Alaska',
    )),
    ("Mountain", (
        'Montana', 'Idaho', 'Wyoming', 'Nevada', 'Utah', 'Colorado',
        'Arizona', 'New Mexico',
    )),
    ("West North Central", (
        'North Dakota', 'Minnesota', 'South Dakota', 'Nebraska', 'Iowa',
        'Kansas', 'Missouri',
    )),
    ("West South Central", (
        'Texas', 'Oklahoma', 'Arkansas', 'Louisiana',
    )),
    ("East North Central", (
        'Wisconsin', 'Illinois', 'Indiana', 'Michigan', 'Ohio',
    )),
    ("East South Central", (
        'Kentucky', 'Tennessee', 'Mississippi', 'Alabama',
    )),
    ("South Atlantic", (
        'West Virginia', 'Maryland', 'Virginia', 'North Carolina',
        'South Carolina', 'Georgia', 'Florida', 'Delaware',
        'District of Columbia',
    )),
    ("Middle Atlantic", (
        'New York', 'New Jersey', 'Pennsylvania',
    )),
    ("New England", (
        'Maine', 'Connecticut', 'Rhode Island', 'Massachusetts',
        'New Hampshire', 'Vermont',
    )),
    ("U.S. territory", (
        'Puerto Rico', 'United States Virgin Islands', 'American Samoa',
        'Guam',
    )),
)

# Lookup table: state or territory name -> region name.
stateToRegion = dict(
    (state, region)
    for region, states in _STATES_BY_REGION
    for state in states
)
|
||
|
||
# Apply the mapping in ./Result to the comments table.  Each non-blank line
# has the form "<state>;<alias>\t<alias>...": every comment whose
# userLocation equals one of the aliases gets the state, country 'US' and the
# state's region.
with open('./Result') as f:
    for rawLine in f:
        if not rawLine.strip():
            continue
        # Drop only the line terminator.  Left in place it would stay glued to
        # the last alias, which would then never match any userLocation.
        line = rawLine.rstrip('\r\n')
        tok = line.split(';', 1)
        if len(tok) < 2:
            print("%d %s" % (len(tok), line))
            continue
        st = tok[0]
        print(st)

        # Resolve the region once per line rather than once per alias.
        region = stateToRegion.get(st)
        if region is None:
            with open('updateErrors', 'a') as ef:
                ef.write(str(st) + '\t' + 'unknown state' + '\n')
            continue

        for n in tok[1].split('\t'):
            if not n:
                continue  # consecutive tabs or an empty alias list
            try:
                c.execute('update comments set Location = ?,country = ?,locationRegion = ? where userLocation = ?', (st, 'US', region, n))
            except sqlite3.Error as e:
                # Best effort: record the failing alias and carry on.
                with open('updateErrors', 'a') as ef:
                    ef.write(str(st) + str(n) + '\t' + str(e) + '\n')
db.commit()
db.close()
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
import sqlite3 | ||
import sexmachine.detector as gender | ||
import nltk | ||
import string | ||
|
||
# Explode the ';'-separated `keywords` column of `articles` into one
# (webURL, keyword) row per keyword in `articleKeywords`.
db = sqlite3.connect('../../commentsData.db')
c = db.cursor()

c.execute("create table if not exists articleKeywords(webURL text,keyword text, PRIMARY KEY(webURL,keyword))")

sqlstat = "insert into articleKeywords values(?,?)"

c.execute("select webURL, keywords from articles")

errCount = 0   # rows rejected by the (webURL, keyword) primary key
proc = 0       # articles that had at least some keyword text

# fetchall() materialises the result first, so the same cursor can be reused
# for the inserts inside the loop.
for url, keyword in c.fetchall():
    # A NULL keywords column has nothing to split.
    if keyword is None or not keyword.strip():
        continue

    for kw in keyword.split(';'):
        kw = kw.strip()
        if not kw:
            continue  # produced by a trailing or doubled ';'
        try:
            c.execute(sqlstat, (url, kw))
        except sqlite3.IntegrityError:
            errCount += 1

    proc += 1
    if proc % 1000 == 0:
        print("articles processed :  %d" % proc)

print(errCount)
db.commit()
c.close()
db.close()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
|
||
import sqlite3 | ||
import os | ||
import sys | ||
from threading import Thread | ||
import threading | ||
# Python 2 only: reloading sys makes setdefaultencoding available again so
# implicit str/unicode conversions default to UTF-8.
reload(sys)
sys.setdefaultencoding('utf8')

# Directory of per-file comment text and directory of the matching
# "<name>_result.txt" LIWC output files.
baseDIR = '/home/akshay/IIT KGP/SEM 2/Complex Network/Github/Complex-Term-Project/Comments FOR LIWC/Male/'
liwcDIR = '/home/akshay/IIT KGP/SEM 2/Complex Network/Github/Complex-Term-Project/Results/male/'

# Lock protecting `count`, the number of lines mapped so far by all threads.
tlock = threading.Lock()
count = 0
|
||
|
||
def func1(f): | ||
global count | ||
db = sqlite3.connect('../../commentsData.db') | ||
|
||
c = db.cursor() | ||
|
||
fileName = baseDIR + f | ||
outFile = open("../mappedLIWC/id" + f, "w") | ||
liwcFile = open(liwcDIR + f.replace('.txt', '') + '_result.txt') | ||
liwcFile.readline() | ||
with open(fileName) as fo: | ||
for line in fo: | ||
line = line.strip() | ||
if len(line) == 0 or line == '': | ||
continue | ||
liwcLine = liwcFile.readline().split('\t', 1)[1] | ||
c.execute("select commentID from comments where commentBody like ?", ('%' + line + '%',)) | ||
t = c.fetchone() | ||
if t is None: | ||
#print 'none', line | ||
continue | ||
#print t[0], liwcLine | ||
outFile.write(str(t[0]) + "\t\t" + liwcLine + '\n') | ||
with tlock as t: | ||
count+=1 | ||
if count%100==0: | ||
print "proccessed lines",count | ||
|
||
|
||
outFile.close() | ||
liwcFile.close() | ||
db.close() | ||
|
||
# Run func1 once per file in baseDIR, each on its own thread, and wait for
# all of them to finish.
threadslist = list()
print("DB connected")

threadslist.extend(Thread(target=func1, args=(f,)) for f in os.listdir(baseDIR))

# Start every worker before joining any, so they all run concurrently.
for t in threadslist:
    t.start()
for t in threadslist:
    t.join()
|
||
|
||
|
||
|
||
|
||
|
Oops, something went wrong.