-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathBuildIndex.java
49 lines (41 loc) · 1.58 KB
/
BuildIndex.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import java.io.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
 * Command-line tool that reads a plain-text corpus, groups its lines into
 * fixed-size "documents" of lowercase alphabetic tokens, stores them in an
 * {@code Indexer} and serializes that object to a {@code .ser} file in the
 * current working directory.
 *
 * <p>Usage: {@code java BuildIndex <corpus-file>}
 */
public class BuildIndex {
    /** Number of consecutive corpus lines grouped into one document. */
    private static final int LINES_PER_DOC = 5;

    /**
     * Builds the document list from the corpus named by {@code args[0]} and
     * writes the serialized {@code Indexer} next to the working directory.
     *
     * @param args command-line arguments; {@code args[0]} is the corpus path
     */
    public static void main(String[] args) {
        if (args.length < 1) {
            System.err.println("Usage: java BuildIndex <corpus-file>");
            System.exit(1);
        }
        String corpusFilePath = args[0];
        // Only the file name is used, so the output lands in the working directory.
        String outputFileName = outputNameFor(new File(corpusFilePath).getName());
        try {
            List<List<String>> docs = readDocuments(corpusFilePath);
            Indexer idx = new Indexer();
            idx.docs = docs;
            // NOTE(review): Indexer is declared elsewhere; writeObject requires it
            // to be Serializable, otherwise an IOException subtype is raised here.
            try (FileOutputStream fos = new FileOutputStream(outputFileName);
                 ObjectOutputStream oos = new ObjectOutputStream(fos)) {
                oos.writeObject(idx);
            }
        } catch (IOException e) {
            System.err.println("Failed to build index from " + corpusFilePath
                    + " into " + outputFileName);
            e.printStackTrace();
        }
    }

    /**
     * Derives the output file name: a trailing {@code .txt} is swapped for
     * {@code .ser}; any other name gets {@code .ser} appended so the output
     * can never share the input's file name.
     */
    private static String outputNameFor(String fileName) {
        String suffix = ".txt";
        if (fileName.endsWith(suffix)) {
            return fileName.substring(0, fileName.length() - suffix.length()) + ".ser";
        }
        return fileName + ".ser";
    }

    /**
     * Reads the corpus and groups every {@link #LINES_PER_DOC} lines into one
     * token list. A final group with fewer lines is kept rather than dropped.
     *
     * @throws IOException if the corpus cannot be opened or read
     */
    private static List<List<String>> readDocuments(String corpusFilePath) throws IOException {
        List<List<String>> docs = new ArrayList<>();
        try (BufferedReader reader = new BufferedReader(new FileReader(corpusFilePath))) {
            List<String> doc = new ArrayList<>();
            int linesInDoc = 0;
            String line;
            while ((line = reader.readLine()) != null) {
                addTokens(line, doc);
                linesInDoc++;
                if (linesInDoc == LINES_PER_DOC) {
                    docs.add(doc);
                    doc = new ArrayList<>();
                    linesInDoc = 0;
                }
            }
            // Keep the trailing partial document when the line count is not a
            // multiple of LINES_PER_DOC.
            if (linesInDoc > 0) {
                docs.add(doc);
            }
        }
        return docs;
    }

    /**
     * Appends the lowercase alphabetic tokens of {@code line} to {@code doc}.
     * Every non-ASCII-letter character acts as a separator; lines without any
     * letters contribute no tokens.
     */
    private static void addTokens(String line, List<String> doc) {
        // Locale.ROOT keeps the mapping stable (e.g. 'I' -> 'i') under any default locale.
        String processed = line.replaceAll("[^a-zA-Z]", " ")
                .toLowerCase(java.util.Locale.ROOT)
                .trim();
        if (processed.isEmpty()) {
            // "".split(...) would yield a single empty token; skip instead.
            return;
        }
        doc.addAll(Arrays.asList(processed.split("\\s+")));
    }
}