All Files Organized folder wise

patilakshay227 · Apr 30, 2018 · 7f6cb72 · 7f6cb72
1 parent ce5ebe6
commit 7f6cb72
Show file tree

Hide file tree

Showing 65 changed files with 635,011 additions and 0 deletions.
diff --git a/AvgMaleCommentLength.py b/AvgMaleCommentLength.py
@@ -0,0 +1,64 @@
+"""Report the average number of words per comment for male and female commenters.
+
+Reads every comment whose author is labelled 'male' or 'female' in the
+commenterGender table of ../commentsData.db, tokenizes the comment body with
+NLTK and prints per-gender word totals, comment totals and averages.
+Tokens found in string.punctuation are not counted as words.
+"""
+import sqlite3
+import nltk
+import string
+
+db = sqlite3.connect('../commentsData.db')
+c = db.cursor()
+
+# The gender filter uses IN so that it applies together with the join
+# conditions. Written as "... and gender='male' or gender='female'", AND binds
+# tighter than OR, so every 'female' gender row would be paired with every
+# comment regardless of userID/username.
+c.execute("select commentBody,g.gender from comments c,commenterGender g where c.userID=g.userID and\
+            c.username=g.username and g.gender in ('male','female')")
+
+totalWordsinMale=0
+commentsByMale=0
+
+totalWordsinFeMale=0
+commentsByFeMale=0
+progress=0
+
+
+def averageWords(totalWords, totalComments):
+    """Return totalWords/totalComments as a float, or 0.0 when there are no comments."""
+    if totalComments==0:
+        return 0.0
+    return float(totalWords)/totalComments
+
+
+# Iterate the cursor directly instead of materializing every row with fetchall().
+for commentBody,gender in c:
+    progress+=1
+
+    # Count the tokens of this comment that are not punctuation.
+    wordCount=0
+    for w in nltk.word_tokenize(commentBody):
+        if w not in string.punctuation:
+            wordCount+=1
+
+    if gender=='male':
+        commentsByMale+=1
+        totalWordsinMale+=wordCount
+    elif gender=='female':
+        commentsByFeMale+=1
+        totalWordsinFeMale+=wordCount
+
+    if progress%50000==0:
+        print progress," Comments processed"
+
+db.close()
+
+print "total words by male",totalWordsinMale," total comments by male",commentsByMale
+print "total words by female",totalWordsinFeMale," total comments by female",commentsByFeMale
+
+# Float division: in Python 2, int/int would truncate the average.
+print "male avg words per comment",averageWords(totalWordsinMale,commentsByMale)
+print "female avg words per comment",averageWords(totalWordsinFeMale,commentsByFeMale)
diff --git a/Extra/keywordExtract.py b/Extra/keywordExtract.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python2
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Mar  8 20:45:43 2018
+
+@author: Ashwin
+
+Load article keywords from a file of JSON documents (one per line) into the
+ArticleKeywords table of an SQLite database.
+"""
+import simplejson as json
+import pickle
+import datetime
+import sqlite3
+import traceback
+import dateutil.parser as dp
+
+inputFile = "../../Project/articleJSON.txt"
+
+# NOTE(review): other scripts in this project open 'commentsData.db'; confirm
+# that 'commentdata.db' is the intended database file.
+db = sqlite3.connect('../commentdata.db')
+
+c = db.cursor()
+
+c.execute('CREATE TABLE IF NOT EXISTS ArticleKeywords(id text,keyword text,PRIMARY KEY(id,keyword) ) ')
+
+
+def writeArticleInDB(article):
+    """Insert one (article id, keyword) row for every keyword of `article`.
+
+    `article` is a mapping with an '_id' entry and, optionally, a 'keywords'
+    list whose items each carry a 'value' entry. Empty articles and articles
+    without keywords insert nothing.
+    """
+    if not article:
+        return
+    ID = article['_id']
+    # OR IGNORE: a repeated (id, keyword) pair violates the primary key; with a
+    # plain INSERT that error also discarded the remaining articles of the line.
+    sqlStat = "INSERT OR IGNORE INTO ArticleKeywords VALUES(?,?)"
+    records = [(ID, k['value']) for k in article.get('keywords') or []]
+    if records:
+        c.executemany(sqlStat,records)
+
+
+def parseFile():
+    """Parse `inputFile` line by line and store the keywords of every article.
+
+    Each line is decoded as JSON and the records under ['response']['docs']
+    are passed to writeArticleInDB. A failing line is recorded, with its
+    traceback, in the file "log" and parsing continues; errors whose message
+    is 'response' are skipped without logging.
+    All inserts are committed once, after the last line.
+    """
+    noOfLinesParsed = 0
+    with open(inputFile) as f:
+        for line in f:
+            try:
+                noOfLinesParsed += 1
+                parsed = json.loads(line)
+
+                for record in parsed['response']['docs']:
+                    writeArticleInDB(record)
+
+                if (noOfLinesParsed % 1000 == 0):
+                    print "No of lines Parsed : ", noOfLinesParsed
+            except Exception as e:
+                # The message is 'response' e.g. for a KeyError on a line without that key.
+                if e.message!='response':
+                    with open("log", "a") as log:
+                        log.write("Error on line " + str(noOfLinesParsed) + "\n")
+                        log.write(traceback.format_exc())
+
+        db.commit()
+
+
+if __name__ == "__main__":
+    parseFile()
+    db.close()
diff --git a/Extra/licwtry.py b/Extra/licwtry.py
@@ -0,0 +1,41 @@
+"""Average the numeric columns of the tab-separated result files in `base`.
+
+The first line of every file is treated as a header; the header of the first
+file read supplies the column names. Column 0 is not summed: the label "male"
+is printed in its place.
+"""
+import os
+
+base='/home/ashwin/Downloads/Results/male/'
+numColumns=82   # label column + 81 summed columns
+indexes=[]      # column names, taken from the first header line read
+total=0         # number of data rows summed over all files
+res=["male"]+[0.0]*(numColumns-1)
+
+for filename in os.listdir(base):
+    with open(base+filename,"r") as f:
+        for lineNo,line in enumerate(f):
+            line=line.rstrip()
+            if lineNo==0:
+                # Header line: remember the first one seen, skip all of them.
+                if not indexes:
+                    indexes=line.split("\t")
+                continue
+            if not line:
+                # A blank line holds no values; counting it would lower the averages.
+                continue
+            total+=1
+            values=line.split("\t")
+            for i in range(1,len(values)):
+                res[i]+=float(values[i])
+
+if total==0:
+    # Nothing to average (no files, or headers only): avoid dividing by zero.
+    print "No data rows found in",base
+else:
+    for i in range(0,numColumns):
+        if i==0:
+            print indexes[i],"\t",res[i]
+        else:
+            print indexes[i],"\t",float(res[i])/total
+
diff --git a/Extra/liwcfemaletry.py b/Extra/liwcfemaletry.py
@@ -0,0 +1,41 @@
+"""Average the numeric columns of the tab-separated result files in `base`.
+
+The first line of every file is treated as a header; the header of the first
+file read supplies the column names. Column 0 is not summed: the label "female"
+is printed in its place.
+"""
+import os
+
+base='/home/ashwin/Downloads/Results/female/'
+numColumns=82   # label column + 81 summed columns
+indexes=[]      # column names, taken from the first header line read
+total=0         # number of data rows summed over all files
+res=["female"]+[0.0]*(numColumns-1)
+
+for filename in os.listdir(base):
+    with open(base+filename,"r") as f:
+        for lineNo,line in enumerate(f):
+            line=line.rstrip()
+            if lineNo==0:
+                # Header line: remember the first one seen, skip all of them.
+                if not indexes:
+                    indexes=line.split("\t")
+                continue
+            if not line:
+                # A blank line holds no values; counting it would lower the averages.
+                continue
+            total+=1
+            values=line.split("\t")
+            for i in range(1,len(values)):
+                res[i]+=float(values[i])
+
+if total==0:
+    # Nothing to average (no files, or headers only): avoid dividing by zero.
+    print "No data rows found in",base
+else:
+    for i in range(0,numColumns):
+        if i==0:
+            print indexes[i],"\t",res[i]
+        else:
+            print indexes[i],"\t",float(res[i])/total
+
diff --git a/Extra/updtateLocations.py b/Extra/updtateLocations.py
@@ -0,0 +1,109 @@
+"""Set Location, country and locationRegion of comments from the './Result' file.
+
+Every non-blank line of './Result' is "<state>;<names>", where <names> is a
+tab-separated list; each comment whose userLocation equals one of the names
+is assigned that state, country 'US' and the state's region.
+"""
+import sqlite3
+
+db = sqlite3.connect('../../commentsData.db')
+c = db.cursor()
+
+# Region assigned to each US state / territory name.
+stateToRegion = {
+    'Washington': "Pacific",
+    'Oregon':"Pacific",
+    'California': "Pacific",
+    'Hawaii' : "Pacific",
+    'Alaska': "Pacific",
+
+    'Montana':"Mountain",
+    'Idaho':"Mountain",
+    'Wyoming':"Mountain",
+    'Nevada':"Mountain",
+    'Utah': "Mountain",
+    'Colorado': "Mountain",
+    'Arizona':"Mountain",
+    'New Mexico': "Mountain",
+
+    'North Dakota': "West North Central",
+    'Minnesota':"West North Central",
+    'South Dakota':"West North Central",
+    'Nebraska':"West North Central",
+    'Iowa': "West North Central",
+    'Kansas':"West North Central",
+    'Missouri':"West North Central",
+
+    'Texas':"West South Central",
+    'Oklahoma':"West South Central",
+    'Arkansas':"West South Central",
+    'Louisiana': "West South Central",
+
+    'Wisconsin': "East North Central",
+    'Illinois': "East North Central",
+    'Indiana': "East North Central",
+    'Michigan' : "East North Central",
+    'Ohio': "East North Central",
+
+    'Kentucky': "East South Central",
+    'Tennessee': "East South Central",
+    'Mississippi': "East South Central",
+    'Alabama': "East South Central",
+
+    'West Virginia': "South Atlantic",
+    'Maryland': "South Atlantic",
+    'Virginia': "South Atlantic",
+    'North Carolina': "South Atlantic",
+    'South Carolina' : "South Atlantic",
+    'Georgia': "South Atlantic",
+    'Florida': "South Atlantic",
+    'Delaware': "South Atlantic",
+    'District of Columbia' : "South Atlantic",
+
+    'New York': "Middle Atlantic",
+    'New Jersey': "Middle Atlantic",
+    'Pennsylvania': "Middle Atlantic",
+
+    'Maine': "New England",
+    'Connecticut': "New England",
+    'Rhode Island': "New England",
+    'Massachusetts': "New England",
+    'New Hampshire': "New England",
+    'Vermont': "New England",
+
+
+    'Puerto Rico': "U.S. territory",
+    'United States Virgin Islands': "U.S. territory",
+    'American Samoa': "U.S. territory",
+    'Guam': "U.S. territory"
+
+}
+
+
+with open('./Result') as f:
+    for line in f:
+        # Drop the line terminator first; otherwise it stays attached to the
+        # last name of the line and that name never matches a userLocation.
+        line = line.rstrip('\r\n')
+        if len(line.strip()) == 0:
+            continue
+        tok = line.split(';',1)
+        if len(tok) < 2:
+            print len(tok),line
+            continue
+        st = tok[0]
+        names = tok[1].split('\t')
+        print st
+        for n in names:
+            if not n:
+                # An empty name (e.g. from a trailing tab) would match every
+                # comment whose userLocation is empty.
+                continue
+            try:
+                c.execute('update comments set Location = ?,country = ?,locationRegion = ? where userLocation = ?', (st, 'US', stateToRegion[st], n))
+            except Exception as e:
+                # Best effort: record the failure (e.g. unknown state) and go on.
+                with open('updateErrors','a') as ef:
+                    ef.write(str(st)+ str(n) + '\t' + str(e.message) + '\n')
+db.commit()
+db.close()
diff --git a/Graphs/AverageCommentsPerSection.png b/Graphs/AverageCommentsPerSection.png
diff --git a/Graphs/Commenter_NoOfArticles.png b/Graphs/Commenter_NoOfArticles.png
diff --git a/Graphs/Commenter_NoOfSections.png b/Graphs/Commenter_NoOfSections.png
diff --git a/Graphs/Figure_1-1.png b/Graphs/Figure_1-1.png
diff --git a/Graphs/Figure_1.png b/Graphs/Figure_1.png
diff --git a/Graphs/Gender_NoOfArticles.png b/Graphs/Gender_NoOfArticles.png
diff --git a/Graphs/Gender_NoOfSections.png b/Graphs/Gender_NoOfSections.png
diff --git a/Graphs/commentsDistribution.png b/Graphs/commentsDistribution.png
diff --git a/Graphs/sectionsDistribution.png b/Graphs/sectionsDistribution.png
diff --git a/Keywords/makeKeywordsTable.py b/Keywords/makeKeywordsTable.py
@@ -0,0 +1,41 @@
+"""Split the ';'-separated `keywords` column of the articles table into one
+articleKeywords row per (webURL, keyword) pair."""
+import sqlite3
+import sexmachine.detector as gender
+import nltk
+import string
+
+db = sqlite3.connect('../../commentsData.db')
+c = db.cursor()
+
+c.execute("create table if not exists articleKeywords(webURL text,keyword text, PRIMARY KEY(webURL,keyword))")
+
+sqlstat="insert into articleKeywords values(?,?)"
+
+c.execute("select webURL, keywords from articles")
+
+errCount = 0    # inserts rejected with IntegrityError (repeated webURL/keyword pair)
+proc = 0        # articles whose keywords value was not blank
+# fetchall() first: the same cursor is reused for the inserts below.
+for url, keyword in c.fetchall():
+    # NULL or blank keywords: nothing to split.
+    if keyword is None or len(keyword.strip()) == 0:
+        continue
+
+    for kw in keyword.split(';'):
+        if len(kw.strip()) == 0:
+            # e.g. a trailing ';' -- do not store an empty keyword.
+            continue
+        try:
+            c.execute(sqlstat, (url, kw))
+        except sqlite3.IntegrityError:
+            errCount += 1
+
+    proc += 1
+    if proc % 1000 == 0:
+        print "articles processed : ", proc
+
+print errCount
+db.commit()
+c.close()
+db.close()
diff --git a/LIWC/analysisLIWC.py b/LIWC/analysisLIWC.py
@@ -0,0 +1,76 @@
+"""Attach comment IDs to LIWC result rows.
+
+For every file in baseDIR, each non-blank line is paired with the next row of
+the matching '<name>_result.txt' file in liwcDIR, the comments table is
+searched for a commentBody containing that line, and the commentID plus the
+LIWC row (without its first column) is written to ../mappedLIWC/id<file>.
+One thread is started per input file.
+"""
+import sqlite3
+import os
+import sys
+from threading import Thread
+import threading
+reload(sys)
+sys.setdefaultencoding('utf8')
+
+
+baseDIR = '/home/akshay/IIT KGP/SEM 2/Complex Network/Github/Complex-Term-Project/Comments FOR LIWC/Male/'
+liwcDIR = '/home/akshay/IIT KGP/SEM 2/Complex Network/Github/Complex-Term-Project/Results/male/'
+
+count=0                     # lines written so far, summed over all threads
+tlock = threading.Lock()    # guards count
+
+
+def func1(f):
+    """Write the commentID / LIWC row mapping for the input file named `f`.
+
+    Opens its own database connection (the function runs in a worker thread).
+    Lines for which no comment is found are skipped. Processing stops when the
+    LIWC result file has no more rows.
+    """
+    global count
+    db = sqlite3.connect('../../commentsData.db')
+    try:
+        c = db.cursor()
+        # The with-statement closes all three files even if a row fails.
+        with open(baseDIR + f) as fo, \
+                open(liwcDIR + f.replace('.txt', '') + '_result.txt') as liwcFile, \
+                open("../mappedLIWC/id" + f, "w") as outFile:
+            liwcFile.readline()  # skip the first line (presumably the header)
+            for line in fo:
+                line = line.strip()
+                if len(line) == 0:
+                    continue
+                # NOTE(review): assumes the LIWC file holds one row per
+                # non-blank input line, in the same order -- confirm.
+                liwcRow = liwcFile.readline()
+                if not liwcRow:
+                    # LIWC file exhausted: nothing left to map.
+                    break
+                liwcLine = liwcRow.split('\t', 1)[1]
+                # NOTE(review): '%' and '_' inside line act as LIKE wildcards.
+                c.execute("select commentID from comments where commentBody like ?", ('%' + line + '%',))
+                match = c.fetchone()
+                if match is None:
+                    continue
+                outFile.write(str(match[0]) + "\t\t" + liwcLine + '\n')
+                with tlock:
+                    count+=1
+                    if count%100==0:
+                        print "proccessed lines",count
+    finally:
+        db.close()
+
+
+threadslist=list()
+print "DB connected"
+
+for f in os.listdir(baseDIR):
+    t = Thread(target=func1, args=(f,))
+    threadslist.append(t)
+
+for t in threadslist:
+    t.start()
+for t in threadslist:
+    t.join()