Skip to content

Commit

Permalink
Precompute onegrams and bigrams and load on demand
Browse files Browse the repository at this point in the history
  • Loading branch information
rdeits committed Jan 9, 2013
1 parent 7432f66 commit 5925ba9
Show file tree
Hide file tree
Showing 4 changed files with 58 additions and 33 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -161,3 +161,4 @@ pip-log.txt

# Mac crap
.DS_Store
data
8 changes: 8 additions & 0 deletions Rakefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
require 'rake/clean'

# `rake clean` removes the generated pickle files.
CLEAN.include("data/*.pck")

# Rebuild the pickled n-gram tables whenever the generator script changes.
# gen_ngrams.py writes its output into data/, which is git-ignored and so is
# absent from a fresh checkout; create it first so the script's open() calls
# do not fail on a missing directory.
file "data/raw_onegrams.pck" => ["gen_ngrams.py"] do
  mkdir_p "data"
  sh "python gen_ngrams.py"
end

# `rake data` builds the precomputed n-gram data.
task :data => ["data/raw_onegrams.pck"]
40 changes: 7 additions & 33 deletions frequencies.py
Original file line number Diff line number Diff line change
@@ -1,46 +1,20 @@
from __future__ import division
from collections import defaultdict
import re
import cPickle as pickle

# NOTE(review): this span comes from a rendered diff whose +/- markers and
# indentation were lost, so removed and added lines appear interleaved. The
# diff header reports 7 additions and 33 deletions; presumably the in-line
# text parsing is the removed code and the pickle loads replace it -- confirm
# against the repository before relying on this listing.

# Accumulate raw unigram counts by parsing the text corpus directly.
raw_onegrams = defaultdict(lambda: 0)

with open('raw_data/all.num.o5.txt', 'r') as f:
onegram_total = 0
for i, line in enumerate(f.readlines()):
# Skip any line that contains '!' or '&'.
if '!' in line or '&' in line:
continue
# Each remaining line must split into exactly four space-separated fields,
# otherwise the unpacking raises ValueError.
freq, word, pos, num_files = line.split(' ')
# Upper-case the word so counts for different casings are merged.
word = word.upper()
freq = int(freq)
raw_onegrams[word] += freq
onegram_total += freq

# Load the unigram counts from the pickle that gen_ngrams.py writes.
with open('data/raw_onegrams.pck', 'rb') as f:
raw_onegrams = pickle.load(f)
onegram_total = sum(raw_onegrams.values())
# Convert each count to a relative frequency in place.
for word in raw_onegrams:
raw_onegrams[word] = raw_onegrams[word] / onegram_total

# Words missing from the table get a default value of 1 / onegram_total.
onegrams = defaultdict(lambda: 1 / onegram_total, raw_onegrams)

# Accumulate raw bigram counts by parsing the text corpus directly.
raw_bigrams = defaultdict(lambda: 0)

with open('raw_data/bigrams.txt', 'r') as f:
bigram_total = 0
for i, line in enumerate(f.readlines()):
# Each line is "<words>\t<count>".
words, count = line.split('\t')
count = int(count)
# Drop bigrams seen fewer than 5 times.
if count < 5:
continue
# NOTE(review): inside the character class, ' -_' is a range from space to
# underscore rather than a literal hyphen -- confirm this is intended.
if re.search(r'[^a-zA-Z0-9 -_]', words) or re.search(r'0[A-Z]+\.0', words):
continue
words = words.upper()
# Requires exactly two space-separated words, otherwise ValueError is raised.
word0, word1 = words.split(' ')
raw_bigrams[(word0, word1)] += count
bigram_total += count
# if i % 10000 == 0:
# print i, "/", "5612484"

# Load the bigram counts from the pickle that gen_ngrams.py writes.
with open('data/raw_bigrams.pck', 'rb') as f:
raw_bigrams = pickle.load(f)
bigram_total = sum(raw_bigrams.values())
# Convert each count to a relative frequency in place.
for pair in raw_bigrams:
raw_bigrams[pair] = raw_bigrams[pair] / bigram_total

# Pairs missing from the table get a default value of 1 / bigram_total.
bigrams = defaultdict(lambda: 1 / bigram_total, raw_bigrams)


Expand Down
42 changes: 42 additions & 0 deletions gen_ngrams.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
from __future__ import division
import cPickle as pickle
import os
import re

ONEGRAM_SOURCE = 'raw_data/all.num.o5.txt'
BIGRAM_SOURCE = 'raw_data/bigrams.txt'
ONEGRAM_PICKLE = 'data/raw_onegrams.pck'
BIGRAM_PICKLE = 'data/raw_bigrams.pck'

# Bigrams seen fewer times than this are dropped.
MIN_BIGRAM_COUNT = 5

# Compiled once instead of being looked up again for every input line.
# NOTE(review): inside the character class, ' -_' is a *range* from space
# (0x20) to underscore (0x5F), not a literal hyphen, so most ASCII punctuation
# is accepted. The pattern is kept byte-for-byte so the generated data does
# not change; confirm whether a literal '-' was intended.
_BAD_BIGRAM_CHARS = re.compile(r'[^a-zA-Z0-9 -_]')
_BAD_BIGRAM_TOKEN = re.compile(r'0[A-Z]+\.0')


def count_onegrams(lines):
    """Return a dict mapping each upper-cased word to its summed frequency.

    ``lines`` is any iterable of strings shaped like
    ``"<freq> <word> <pos> <num_files>"``. Lines containing '!' or '&' are
    skipped, as are blank lines. Any other line that does not split into
    exactly four space-separated fields raises ValueError.
    """
    counts = {}
    for line in lines:
        if not line.strip():
            # A stray blank line carries no data; do not abort the whole run.
            continue
        if '!' in line or '&' in line:
            continue
        freq, word, _pos, _num_files = line.split(' ')
        word = word.upper()
        counts[word] = counts.get(word, 0) + int(freq)
    return counts


def count_bigrams(lines, min_count=MIN_BIGRAM_COUNT):
    """Return a dict mapping (WORD0, WORD1) tuples to their summed counts.

    ``lines`` is any iterable of strings shaped like ``"<w0> <w1>\\t<count>"``.
    Entries with a count below ``min_count`` or whose text matches either
    filter pattern are skipped, as are blank lines. The filters run on the
    original casing; the words are upper-cased afterwards. Text that does not
    split into exactly two space-separated words raises ValueError.
    """
    counts = {}
    for line in lines:
        if not line.strip():
            continue
        words, count = line.split('\t')
        count = int(count)
        if count < min_count:
            continue
        if _BAD_BIGRAM_CHARS.search(words) or _BAD_BIGRAM_TOKEN.search(words):
            continue
        word0, word1 = words.upper().split(' ')
        pair = (word0, word1)
        counts[pair] = counts.get(pair, 0) + count
    return counts


def _dump(obj, path):
    """Pickle ``obj`` to ``path``, creating the parent directory if needed."""
    parent = os.path.dirname(path)
    if parent and not os.path.isdir(parent):
        os.makedirs(parent)
    with open(path, 'wb') as f:
        pickle.dump(obj, f)


def main():
    """Read both raw corpora and write the two pickled count tables."""
    # Iterate the file objects directly so the corpora are streamed line by
    # line rather than loaded into memory with readlines().
    with open(ONEGRAM_SOURCE, 'r') as f:
        raw_onegrams = count_onegrams(f)
    _dump(raw_onegrams, ONEGRAM_PICKLE)

    with open(BIGRAM_SOURCE, 'r') as f:
        raw_bigrams = count_bigrams(f)
    _dump(raw_bigrams, BIGRAM_PICKLE)


if __name__ == '__main__':
    main()

0 comments on commit 5925ba9

Please sign in to comment.