This repository has been archived by the owner on Oct 24, 2018. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathindex.py
70 lines (62 loc) · 2.58 KB
/
index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#!/usr/bin/python3
#
# Full-text indexing and search of session recordings
#
import pickle
import sys
import math
import time
import os
from subprocess import Popen, PIPE
from whoosh.qparser import QueryParser
from whoosh.index import create_in, exists_in, open_dir
from whoosh.fields import Schema, ID, TEXT, DATETIME
class Indexor():
    """Full-text index over session recordings, backed by Whoosh.

    Every indexed document stores the session id, the text extracted from
    the session's JSON recording, the start and end datetimes, the user name
    and the remote host.
    """

    # Hits per result page; also used to derive ``total_pages`` in search().
    PAGE_LENGTH = 10

    def __init__(self, recordings_dir="/var/recordings"):
        """Open the index below *recordings_dir*, creating it when absent.

        Args:
            recordings_dir: Directory containing the ``<session>.json``
                recordings. The index is kept in its ``index`` subdirectory.
        """
        self.recordings_dir = recordings_dir
        index_dir = "%s/index" % recordings_dir
        if exists_in(index_dir):
            self.idx = open_dir(index_dir)
            return
        # The schema is only needed when a brand-new index is created.
        schema = Schema(session=ID(stored=True),
                        content=TEXT(stored=True),
                        start=DATETIME(stored=True),
                        end=DATETIME(stored=True),
                        username=TEXT(stored=True),
                        remote=TEXT(stored=True))
        # create_in() expects the target directory to exist already.
        os.makedirs(index_dir, exist_ok=True)
        self.idx = create_in(index_dir, schema)

    def index(self, session, start, end, username, remote):
        """Add the recording of *session* to the index.

        Does nothing (and returns ``None``) when no recording file exists
        for the session.

        Args:
            session: Session id; the recording is ``<session>.json``.
            start: Value stored in the ``start`` DATETIME field.
            end: Value stored in the ``end`` DATETIME field.
            username: Value stored in the ``username`` field.
            remote: Value stored in the ``remote`` field.
        """
        recording = "%s/%s.json" % (self.recordings_dir, session)
        if not os.path.exists(recording):
            return
        # Parse the recording before opening the writer so the index write
        # lock is not held while the (possibly large) file is read.
        content = self._get_content(recording)
        # As a context manager the writer commits on success and cancels on
        # error, so a failed add_document() is never committed.
        with self.idx.writer() as writer:
            writer.add_document(session=session, start=start, end=end,
                                content=content, username=username,
                                remote=remote)

    def _get_content(self, recording):
        """Return the concatenated text of a recording file.

        The file is streamed with ijson; for every entry of the top-level
        ``stdout`` array the element at index 1 is appended to the result.

        Args:
            recording: Path of the JSON recording.

        Returns:
            The joined text as a single string.
        """
        # Imported lazily, as in the original code; ijson is only needed
        # while indexing.
        import ijson
        with open(recording, "rb") as f:
            return "".join(entry[1] for entry in ijson.items(f, 'stdout.item'))

    def search(self, query, page, username, remote):
        """Run *query* against the index and return one page of hits.

        Args:
            query: Query string in Whoosh query syntax; unqualified terms
                are searched in the ``content`` field.
            page: Page number handed to ``search_page``.
            username: When truthy, a ``username:`` clause is appended.
            remote: When truthy, a ``remote:`` clause is appended.

        Returns:
            A dict with ``total_pages`` (int), ``page`` (list of hit dicts
            with ``terms``, ``session``, ``username``, ``remote``, ``start``,
            ``end`` and ``highlights``) and ``stats`` (summary string).
        """
        started = time.perf_counter()
        # NOTE(review): the filters are appended to the raw query text before
        # parsing, so query syntax inside any of the three inputs influences
        # how the filter clauses are interpreted -- confirm the inputs are
        # trusted or compose the clauses as query objects instead.
        if username:
            query += " username:%s" % username
        if remote:
            query += " remote:%s" % remote
        parsed = QueryParser("content", schema=self.idx.schema).parse(query)
        # Hits must be consumed while the searcher is still open.
        with self.idx.searcher() as searcher:
            results = searcher.search_page(parsed, page,
                                           pagelen=self.PAGE_LENGTH,
                                           terms=True)
            total = len(results)
            hits = [
                dict(terms=[term.decode() for _, term in hit.matched_terms()],
                     session=hit['session'],
                     username=hit['username'],
                     remote=hit['remote'],
                     start=hit['start'].timestamp(),
                     end=hit['end'].timestamp(),
                     highlights=hit.highlights("content"))
                for hit in results
            ]
            result = dict(total_pages=math.ceil(total / self.PAGE_LENGTH),
                          page=hits)
            result["stats"] = "Found %s hits in %s seconds." % (
                total, time.perf_counter() - started)
        return result