Skip to content

Commit

Permalink
All Files Organized folder wise
Browse files Browse the repository at this point in the history
  • Loading branch information
patilakshay227 committed Apr 30, 2018
1 parent ce5ebe6 commit 7f6cb72
Show file tree
Hide file tree
Showing 65 changed files with 635,011 additions and 0 deletions.
41 changes: 41 additions & 0 deletions AvgMaleCommentLength.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import sqlite3
import nltk
import string

DB_PATH = '../commentsData.db'
GENDERS = ('male', 'female')

# The gender filter uses IN (...) so it is applied together with the join
# conditions.  The previous "... and gender='male' or gender='female'" bound
# the OR last, which paired every female commenter with every comment.
GENDER_QUERY = (
    "select commentBody, g.gender "
    "from comments c, commenterGender g "
    "where c.userID = g.userID and c.username = g.username "
    "and g.gender in ('male', 'female')"
)

_PUNCTUATION = frozenset(string.punctuation)


def isWord(token):
    """Return True when *token* has at least one non-punctuation character.

    Tokens made only of punctuation (including multi-character ones such as
    "..." or "--") and empty tokens are not words.
    """
    return any(ch not in _PUNCTUATION for ch in token)


def averageWords(totalWords, comments):
    """Return the mean words per comment as a float (0.0 for no comments)."""
    if not comments:
        return 0.0
    return float(totalWords) / comments


def fetchGenderedComments(connection):
    """Run GENDER_QUERY on *connection* and return the executing cursor.

    The cursor yields (commentBody, gender) rows lazily, so the whole result
    set does not have to be held in memory.
    """
    cursor = connection.cursor()
    cursor.execute(GENDER_QUERY)
    return cursor


def commentLengthStats(rows, tokenize, reportEvery=50000):
    """Count words and comments per gender.

    rows        -- iterable of (commentBody, gender) pairs
    tokenize    -- callable turning a comment body into a list of tokens
    reportEvery -- print a progress line every this many rows (0 disables)

    Returns a dict mapping each gender in GENDERS to a
    (totalWords, totalComments) tuple.  Rows with any other gender are
    ignored.
    """
    stats = dict((gender, [0, 0]) for gender in GENDERS)
    for progress, (body, gender) in enumerate(rows, 1):
        counters = stats.get(gender)
        if counters is not None:
            # A NULL body is treated as an empty comment.
            tokens = tokenize(body or "")
            counters[0] += sum(1 for token in tokens if isWord(token))
            counters[1] += 1
        if reportEvery and progress % reportEvery == 0:
            print("%d  Comments processed" % progress)
    return dict((gender, tuple(counts)) for gender, counts in stats.items())


def main():
    """Print word/comment totals and average comment length per gender."""
    db = sqlite3.connect(DB_PATH)
    try:
        stats = commentLengthStats(fetchGenderedComments(db),
                                   nltk.word_tokenize)
    finally:
        db.close()

    for gender in GENDERS:
        words, comments = stats[gender]
        print("total words by %s %d  total comments by %s %d"
              % (gender, words, gender, comments))

    print("")
    for gender in GENDERS:
        words, comments = stats[gender]
        print("%s avg words per comment %s"
              % (gender, averageWords(words, comments)))


if __name__ == "__main__":
    main()
59 changes: 59 additions & 0 deletions Extra/keywordExtract.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 8 20:45:43 2018
@author: Ashwin
"""
import simplejson as json
import pickle
import datetime
import sqlite3
import traceback
import dateutil.parser as dp

# Text file read by parseFile(): every line is parsed as one JSON document.
inputFile = "../../Project/articleJSON.txt"

# Shared connection and cursor used by writeArticleInDB() and parseFile().
# NOTE(review): sibling scripts open 'commentsData.db'; confirm this file
# name ('commentdata.db') is the intended database.
db = sqlite3.connect('../commentdata.db')
c = db.cursor()

# One row per (article id, keyword); the composite primary key rejects
# repeated pairs.
c.execute(
    'CREATE TABLE IF NOT EXISTS ArticleKeywords('
    'id text,keyword text,'
    'PRIMARY KEY(id,keyword) ) '
)


def writeArticleInDB(article):
    """Insert one (article id, keyword value) row per keyword of *article*.

    article -- mapping with an '_id' entry and a 'keywords' list whose items
               each carry a 'value' entry.

    An empty or missing article is skipped without touching the database.
    Rows are queued on the module-level cursor `c`; committing is left to
    the caller.
    """
    # Check for an empty article BEFORE subscripting it; the guard used to
    # sit after the lookups it was meant to protect.
    if not article:
        return

    ID = article['_id']
    records = [(ID, k['value']) for k in article['keywords']]
    if records:
        c.executemany("INSERT INTO ArticleKeywords VALUES(?,?)", records)

def parseFile():
    """Load every article found in `inputFile` into the keywords table.

    Each line of the file is parsed as JSON and every entry of
    line['response']['docs'] is passed to writeArticleInDB().  Lines that
    simply lack a 'response' key are skipped silently; any other failure is
    appended, with its traceback, to the file "log" and parsing continues
    with the next line.  The transaction is committed once at the end.
    """
    noOfLinesParsed = 0
    with open(inputFile) as f:
        for line in f:
            noOfLinesParsed += 1
            try:
                parsed = json.loads(line)
                for record in parsed['response']['docs']:
                    writeArticleInDB(record)
            except Exception as e:
                # A missing 'response' key is expected and not worth logging.
                # Compare the exception's args rather than the deprecated,
                # Python 2-only `e.message` attribute.
                missingResponse = (isinstance(e, KeyError)
                                   and e.args == ('response',))
                if not missingResponse:
                    with open("log", "a") as log:
                        log.write("Error on line " + str(noOfLinesParsed) + "\n")
                        log.write(traceback.format_exc())

            if noOfLinesParsed % 1000 == 0:
                print("No of lines Parsed :  %d" % noOfLinesParsed)

    db.commit()


if __name__ == "__main__":
    # Release the database connection even when parsing aborts part-way.
    try:
        parseFile()
    finally:
        db.close()
36 changes: 36 additions & 0 deletions Extra/licwtry.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import os

base = '/home/ashwin/Downloads/Results/male/'
LABEL = "male"


def averageLiwcScores(directory):
    """Average every numeric column over all result files in *directory*.

    Each file is tab-separated; its first line is a header and is skipped.
    The header of the first file read (files are visited in sorted name
    order) names the columns.  Column 0 is a label column and is never
    summed.  Blank lines are ignored so they do not inflate the row count.

    Returns (header, means) where `means[k]` is the mean of column k + 1.
    Columns without data, or a directory without data rows, give 0.0.
    An empty directory returns ([], []).
    """
    header = None
    sums = []          # sums[i] accumulates column i; index 0 stays unused
    rows = 0

    for filename in sorted(os.listdir(directory)):
        with open(os.path.join(directory, filename), "r") as f:
            for lineNo, line in enumerate(f):
                line = line.rstrip()
                if lineNo == 0:
                    if header is None:
                        header = line.split("\t")
                    continue
                if not line:
                    continue
                values = line.split("\t")
                # Size the accumulator from the data instead of assuming a
                # fixed number of columns.
                if len(values) > len(sums):
                    sums.extend([0.0] * (len(values) - len(sums)))
                for i in range(1, len(values)):
                    sums[i] += float(values[i])
                rows += 1

    if header is None:
        return [], []

    sums.extend([0.0] * (len(header) - len(sums)))
    means = [(s / rows if rows else 0.0) for s in sums[1:len(header)]]
    return header, means


def main():
    """Print '<column> <TAB><mean>' for every column of the header."""
    header, means = averageLiwcScores(base)
    if not header:
        return
    print("%s \t%s" % (header[0], LABEL))
    for name, mean in zip(header[1:], means):
        print("%s \t%s" % (name, mean))


if __name__ == "__main__":
    main()

36 changes: 36 additions & 0 deletions Extra/liwcfemaletry.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import os

base = '/home/ashwin/Downloads/Results/female/'
LABEL = "female"


def averageLiwcScores(directory):
    """Average every numeric column over all result files in *directory*.

    Each file is tab-separated; its first line is a header and is skipped.
    The header of the first file read (files are visited in sorted name
    order) names the columns.  Column 0 is a label column and is never
    summed.  Blank lines are ignored so they do not inflate the row count.

    Returns (header, means) where `means[k]` is the mean of column k + 1.
    Columns without data, or a directory without data rows, give 0.0.
    An empty directory returns ([], []).
    """
    header = None
    sums = []          # sums[i] accumulates column i; index 0 stays unused
    rows = 0

    for filename in sorted(os.listdir(directory)):
        with open(os.path.join(directory, filename), "r") as f:
            for lineNo, line in enumerate(f):
                line = line.rstrip()
                if lineNo == 0:
                    if header is None:
                        header = line.split("\t")
                    continue
                if not line:
                    continue
                values = line.split("\t")
                # Size the accumulator from the data instead of assuming a
                # fixed number of columns.
                if len(values) > len(sums):
                    sums.extend([0.0] * (len(values) - len(sums)))
                for i in range(1, len(values)):
                    sums[i] += float(values[i])
                rows += 1

    if header is None:
        return [], []

    sums.extend([0.0] * (len(header) - len(sums)))
    means = [(s / rows if rows else 0.0) for s in sums[1:len(header)]]
    return header, means


def main():
    """Print '<column> <TAB><mean>' for every column of the header."""
    header, means = averageLiwcScores(base)
    if not header:
        return
    print("%s \t%s" % (header[0], LABEL))
    for name, mean in zip(header[1:], means):
        print("%s \t%s" % (name, mean))


if __name__ == "__main__":
    main()

94 changes: 94 additions & 0 deletions Extra/updtateLocations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
import sqlite3

# Open the comments database two directories above the working directory.
db = sqlite3.connect('../../commentsData.db')
# Single cursor on which this script issues its UPDATE statements.
c = db.cursor()

# Lookup from a state/territory name to the region label stored alongside it.
# Written region-first so each group can be checked at a glance; the result
# is the flat {name: region} dictionary the rest of the script indexes.
stateToRegion = dict(
    (state, region)
    for region, states in (
        ("Pacific", (
            'Washington', 'Oregon', 'California', 'Hawaii', 'Alaska',
        )),
        ("Mountain", (
            'Montana', 'Idaho', 'Wyoming', 'Nevada', 'Utah', 'Colorado',
            'Arizona', 'New Mexico',
        )),
        ("West North Central", (
            'North Dakota', 'Minnesota', 'South Dakota', 'Nebraska', 'Iowa',
            'Kansas', 'Missouri',
        )),
        ("West South Central", (
            'Texas', 'Oklahoma', 'Arkansas', 'Louisiana',
        )),
        ("East North Central", (
            'Wisconsin', 'Illinois', 'Indiana', 'Michigan', 'Ohio',
        )),
        ("East South Central", (
            'Kentucky', 'Tennessee', 'Mississippi', 'Alabama',
        )),
        ("South Atlantic", (
            'West Virginia', 'Maryland', 'Virginia', 'North Carolina',
            'South Carolina', 'Georgia', 'Florida', 'Delaware',
            'District of Columbia',
        )),
        ("Middle Atlantic", (
            'New York', 'New Jersey', 'Pennsylvania',
        )),
        ("New England", (
            'Maine', 'Connecticut', 'Rhode Island', 'Massachusetts',
            'New Hampshire', 'Vermont',
        )),
        ("U.S. territory", (
            'Puerto Rico', 'United States Virgin Islands', 'American Samoa',
            'Guam',
        )),
    )
    for state in states
)


def parseResultLine(line):
    """Split one 'State;name<TAB>name...' line into (state, [names]).

    Only the line terminator is removed, so the names keep any inner or
    surrounding spaces exactly as written (they are matched verbatim against
    the userLocation column).  Empty names are dropped.

    Returns None for a blank line or a line without a ';' separator.
    """
    line = line.rstrip('\r\n')
    if not line.strip():
        return None
    tok = line.split(';', 1)
    if len(tok) < 2:
        return None
    names = [n for n in tok[1].split('\t') if n]
    return tok[0], names


def updateLocations(resultPath='./Result', errorPath='updateErrors'):
    """Apply the state/region of every line in *resultPath* to `comments`.

    For each (state, names) line, every comment whose userLocation equals
    one of the names gets Location=state, country='US' and the region from
    stateToRegion.  A failing update (unknown state, database error) is
    appended to *errorPath* and the run continues.  Nothing is committed
    here; the caller commits.
    """
    with open(resultPath) as f:
        for line in f:
            if not line.strip():
                continue
            parsed = parseResultLine(line)
            if parsed is None:
                # Malformed line: report it and move on.
                print("1 %s" % line)
                continue
            st, names = parsed
            print(st)
            for n in names:
                try:
                    c.execute(
                        'update comments set Location = ?,country = ?,'
                        'locationRegion = ? where userLocation = ?',
                        (st, 'US', stateToRegion[st], n))
                except Exception as e:
                    # Best effort: record the failure and keep going.
                    with open(errorPath, 'a') as ef:
                        ef.write(str(st) + str(n) + '\t' + str(e) + '\n')


if __name__ == "__main__":
    try:
        updateLocations()
        db.commit()
    finally:
        db.close()
Binary file added Graphs/AverageCommentsPerSection.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added Graphs/Commenter_NoOfArticles.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added Graphs/Commenter_NoOfSections.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added Graphs/Figure_1-1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added Graphs/Figure_1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added Graphs/Gender_NoOfArticles.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added Graphs/Gender_NoOfSections.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added Graphs/commentsDistribution.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added Graphs/sectionsDistribution.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
39 changes: 39 additions & 0 deletions Keywords/makeKeywordsTable.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import sqlite3
import sexmachine.detector as gender
import nltk
import string

DB_PATH = '../../commentsData.db'

sqlstat = "insert into articleKeywords values(?,?)"


def splitKeywords(keywordField):
    """Return the non-empty, whitespace-trimmed keywords of a ';' list.

    A missing (None), empty or whitespace-only field gives an empty list.
    """
    if not keywordField:
        return []
    return [kw.strip() for kw in keywordField.split(';') if kw.strip()]


def buildKeywordsTable(db):
    """Fill articleKeywords with one row per (webURL, keyword) pair.

    db -- open sqlite3 connection holding an `articles` table with
          `webURL` and `keywords` columns.

    Creates the target table when needed, commits once at the end and
    returns (articlesProcessed, duplicatePairsSkipped).  A pair already in
    the table violates the primary key and is counted instead of stored.
    """
    c = db.cursor()
    c.execute("create table if not exists articleKeywords(webURL text,keyword text, PRIMARY KEY(webURL,keyword))")
    c.execute("select webURL, keywords from articles")

    errCount = 0
    proc = 0
    # fetchall() materialises the result first: the same cursor is reused
    # for the inserts below, which would otherwise reset the SELECT.
    for url, keyword in c.fetchall():
        tokens = splitKeywords(keyword)
        if not tokens:
            continue

        for kw in tokens:
            try:
                c.execute(sqlstat, (url, kw))
            except sqlite3.IntegrityError:
                errCount += 1

        proc += 1
        if proc % 1000 == 0:
            print("articles processed :  %d" % proc)

    db.commit()
    c.close()
    return proc, errCount


def main():
    """Build the keyword table in the comments database and report skips."""
    db = sqlite3.connect(DB_PATH)
    try:
        _, errCount = buildKeywordsTable(db)
    finally:
        db.close()
    print(errCount)


if __name__ == "__main__":
    main()
69 changes: 69 additions & 0 deletions LIWC/analysisLIWC.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@

import sqlite3
import os
import sys
from threading import Thread
import threading
# Re-import sys and switch the interpreter-wide default string encoding to
# UTF-8.  This is a Python 2 idiom (`reload` is a builtin only there) and
# must stay in this order: reload first, then setdefaultencoding.
reload(sys)
sys.setdefaultencoding('utf8')


# Directory whose files func1() reads line by line (one file per thread).
baseDIR = '/home/akshay/IIT KGP/SEM 2/Complex Network/Github/Complex-Term-Project/Comments FOR LIWC/Male/'
# Directory holding the matching '<name>_result.txt' file for each input file.
liwcDIR = '/home/akshay/IIT KGP/SEM 2/Complex Network/Github/Complex-Term-Project/Results/male/'

# Number of lines written so far by all worker threads; func1() updates it
# while holding `tlock`.
count=0
tlock = threading.Lock()


def likeContainsPattern(text):
    """Return a LIKE pattern matching values that contain *text* literally.

    '%', '_' and the escape character itself are backslash-escaped so that
    they are not treated as wildcards; use the pattern together with
    "ESCAPE '\\'" in the SQL statement.
    """
    escaped = (text.replace('\\', '\\\\')
                   .replace('%', '\\%')
                   .replace('_', '\\_'))
    return '%' + escaped + '%'


def func1(f):
    """Map the LIWC result rows of input file *f* to comment IDs.

    f -- file name (not path) inside baseDIR; the matching LIWC output is
         liwcDIR + <f without '.txt'> + '_result.txt'.

    The n-th non-blank line of the input file corresponds to the n-th data
    row of the LIWC file (its first line is a header).  For each line the
    first comment whose body contains it is looked up and
    '<commentID>\\t\\t<LIWC columns after the first>' is written to
    '../mappedLIWC/id<f>'.  Lines without a matching comment are skipped.
    Runs in its own thread with its own database connection; the shared
    progress counter `count` is updated under `tlock`.
    """
    global count

    fileName = baseDIR + f
    liwcName = liwcDIR + f.replace('.txt', '') + '_result.txt'

    db = sqlite3.connect('../../commentsData.db')
    try:
        c = db.cursor()
        # `with` closes all three files even if a lookup raises.
        with open("../mappedLIWC/id" + f, "w") as outFile, \
                open(liwcName) as liwcFile, \
                open(fileName) as fo:
            liwcFile.readline()  # skip the header row
            for line in fo:
                line = line.strip()
                if not line:
                    continue

                liwcRow = liwcFile.readline()
                if not liwcRow:
                    # Fewer result rows than comments: nothing left to map.
                    print("LIWC results exhausted for %s" % f)
                    break
                liwcParts = liwcRow.split('\t', 1)
                if len(liwcParts) < 2:
                    continue  # row without any LIWC columns
                liwcLine = liwcParts[1]

                c.execute(
                    "select commentID from comments "
                    "where commentBody like ? escape '\\'",
                    (likeContainsPattern(line),))
                row = c.fetchone()
                if row is None:
                    continue

                # liwcLine keeps its own line ending; the extra '\n' is
                # retained so the output format is unchanged.
                outFile.write(str(row[0]) + "\t\t" + liwcLine + '\n')
                with tlock:
                    count += 1
                    if count % 100 == 0:
                        print("proccessed lines %d" % count)
    finally:
        db.close()

#c.execute("select commentID from comments where commentBody = ?", (comment))
#files = sorted(os.listdir(baseDIR), key= lambda x: os.path.getctime(baseDIR+x))
print("DB connected")

# Build one worker per file found in baseDIR; all workers are created
# first, then every one is started, and finally every one is awaited.
threadslist = [Thread(target=func1, args=(name,))
               for name in os.listdir(baseDIR)]

for worker in threadslist:
    worker.start()

for worker in threadslist:
    worker.join()






Loading

0 comments on commit 7f6cb72

Please sign in to comment.