Add User API & data pre-processing fixes

qixils · Jan 22, 2022 · 1b23ad8 · 1b23ad8
1 parent 88bcb65
commit 1b23ad8
Show file tree

Hide file tree

Showing 8 changed files with 160 additions and 69 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,3 @@
 .idea/
 *.iml
-groups/
+/data/
diff --git a/viewer-backend/src/main/kotlin/dev/qixils/yahoo/api/Message.kt b/viewer-backend/src/main/kotlin/dev/qixils/yahoo/api/Message.kt
@@ -4,7 +4,8 @@ import java.time.OffsetDateTime
 
 class Message(
     var id: Int = 0,
-    var user: User = User(),
+    var authorId: Int = 0,
+    var alias: String = "",
     var postDate: OffsetDateTime = OffsetDateTime.MIN,
     var subject: String? = null,
     var body: String = "",

diff --git a/viewer-backend/src/main/kotlin/dev/qixils/yahoo/api/User.kt b/viewer-backend/src/main/kotlin/dev/qixils/yahoo/api/User.kt
@@ -1,8 +1,11 @@
 package dev.qixils.yahoo.api
 
+import java.util.*
+
 class User (
     var id: Int = 0,
     var userName: String? = null,
-    var realName: String = "",
-    var displayName: String = ""
+    var knownAliases: Set<String> = Collections.emptySet(),
+    var knownGroups: Set<String> = Collections.emptySet(),
+    var fakeAccount: Boolean = false
 )
diff --git a/viewer-backend/src/main/kotlin/dev/qixils/yahoo/api/plugins/Routing.kt b/viewer-backend/src/main/kotlin/dev/qixils/yahoo/api/plugins/Routing.kt
@@ -2,10 +2,7 @@ package dev.qixils.yahoo.api.plugins
 
 import com.google.gson.Gson
 import com.google.gson.GsonBuilder
-import dev.qixils.yahoo.api.Message
-import dev.qixils.yahoo.api.MessageReference
-import dev.qixils.yahoo.api.OffsetDateTimeAdapter
-import dev.qixils.yahoo.api.Page
+import dev.qixils.yahoo.api.*
 import io.ktor.http.*
 import io.ktor.server.application.*
 import io.ktor.server.response.*
@@ -21,7 +18,8 @@ val gson: Gson = GsonBuilder()
     .registerTypeAdapter(typeOf<OffsetDateTime>().javaType, OffsetDateTimeAdapter())
     .disableHtmlEscaping()
     .create()
-val messages = HashMap<MessageReference, Message>()
+val messages = HashMap<MessageReference, Message?>()
+val users = HashMap<Int, User?>()
 val messageIndices = HashMap<String, List<Int>?>()
 val allGroups = HashSet<String>()
 const val messagesPerPage = 50
@@ -32,6 +30,24 @@ fun Application.configureRouting() {
             call.respond(mapOf("groups" to getGroups()))
         }
 
+        // Responds with the user identified by the `id` path parameter.
+        // 400 when `id` is not an integer, 404 when no such user can be loaded.
+        get("/v1/user/{id}") {
+            val id: Int? = call.parameters["id"]?.toIntOrNull()
+            // Resolve the user exactly once so the existence check and the
+            // response body come from the same lookup (no `!!` required).
+            val user: User? = id?.let(::getUser)
+            when {
+                id == null -> call.respond(
+                    HttpStatusCode.BadRequest,
+                    mapOf("error" to "The argument 'id' could not be parsed")
+                )
+                user == null -> call.respond(
+                    HttpStatusCode.NotFound,
+                    mapOf("error" to "A user with the ID of '$id' could not be found")
+                )
+                else -> call.respond(user)
+            }
+        }
+
         get("/v1/message/{group}/{id}") {
             val group: String = call.parameters["group"] as String
             val id: Int? = call.parameters["id"]?.toIntOrNull()
@@ -103,24 +119,37 @@ fun Application.configureRouting() {
     }
 }
 
+/** Reports whether [getUser] can resolve a user for the given [id]. */
+fun userExists(id: Int): Boolean = getUser(id) != null
+
+/**
+ * Loads the user with the given [id], consulting the in-memory [users] cache first.
+ *
+ * Both hits and misses are cached: an ID whose file is absent is stored as `null`
+ * so that later lookups skip the file-system check.
+ *
+ * @return the parsed user, or `null` when no file exists for [id]
+ */
+fun getUser(id: Int): User? {
+    if (users.containsKey(id)) return users[id]
+    // The extractor writes one record per user to data/users/<id>.json;
+    // data/groups/ only holds the per-group message directories.
+    val file = File("data/users/$id.json")
+    val user: User? = if (file.exists()) gson.fromJson(file.readText(), User::class.java) else null
+    users[id] = user
+    return user
+}
+
 fun getGroups(): Set<String> {
     if (allGroups.isNotEmpty()) return allGroups
-    val directoryURL = Application::class.java.getResource("/groups/")
-    val directory = File(directoryURL!!.path)
+    val directory = File("data/groups/")
     for (file in directory.list()!!)
         allGroups.add(file)
     return allGroups
 }
 
 fun isValid(group: String): Boolean {
     if (messageIndices.containsKey(group)) return messageIndices[group] != null
-    val clazz = Application::class.java
-    val baseURL = clazz.getResource("/groups/$group/")
-    if (baseURL == null) {
+    val directory = File("data/groups/$group/")
+    if (!directory.exists()) {
         messageIndices[group] = null
         return false
     }
-    val directory = File(baseURL.path)
     val files = directory.list { _: File, filename: String -> filename.endsWith(".json") }
     if (files == null) {
         messageIndices[group] = null
@@ -142,14 +171,14 @@ fun isValid(group: String): Boolean {
 }
 
 fun fetchMessage(reference: MessageReference): Message? {
-    val cache: Message? = messages[reference]
-    if (cache != null)
-        return cache
-
-    val text: String = Application::class.java
-        .getResource("/groups/${reference.group}/${reference.id}.json")
-        ?.readText() ?: return null
+    if (messages.containsKey(reference)) return messages[reference]
 
+    val file = File("data/groups/${reference.group}/${reference.id}.json")
+    if (!file.exists()) {
+        messages[reference] = null
+        return null
+    }
+    val text: String = file.readText()
     val value: Message = gson.fromJson(text, Message::class.java)
     messages[reference] = value
     return value

diff --git a/viewer-frontend/src/routes/group/[group]/[page].svelte b/viewer-frontend/src/routes/group/[group]/[page].svelte
@@ -1,5 +1,3 @@
-<!--suppress TypeScriptUnresolvedVariable-->
-<!--(IntelliJ is for some reason not parsing the tsconfig.json file)-->
 <script context="module">
   /** @type {import('@sveltejs/kit').Load} */
   export async function load({ params, fetch, session, stuff }) {

diff --git a/viewer-frontend/src/routes/index.svelte b/viewer-frontend/src/routes/index.svelte
@@ -33,6 +33,7 @@ import Error from "$lib/Error.svelte";
 			<p>Loading groups, please wait...</p>
 		{:then group_data}
 			<!-- Form theming adopted from https://flowbite.com/docs/components/forms/ -->
+      <!-- TODO: once form is expanded, should totally use the fancy gradient outline button from https://flowbite.com/docs/components/buttons/ -->
 			<select bind:value={selected} on:change='{() => window.location.href = "/group/" + selected + "/1"}' class="bg-gray-50 border border-gray-300 text-gray-900 text-sm rounded-lg focus:ring-blue-500 focus:border-blue-500 block w-full p-2.5 dark:bg-gray-700 dark:border-gray-600 dark:placeholder-gray-400 dark:text-white dark:focus:ring-blue-500 dark:focus:border-blue-500">
 				{#each group_data.groups as group}
 					<option value={group}>{group}</option>

diff --git a/warc-extractor/.gitignore b/warc-extractor/.gitignore
@@ -1,3 +1,3 @@
-*.warc
+/archives/
 /venv/
 possible_errors.txt
diff --git a/warc-extractor/main.py b/warc-extractor/main.py
@@ -10,34 +10,42 @@
 
 MESSAGE_ID = re.compile(r"^org\.archive\.yahoogroups:v1/group/[a-z_]+/message/(\d+)/raw$")
 LINEBREAKS = re.compile(r"\r?\n")
-SECTION_PREFIX = re.compile(r"^(?:-{5,}_?=_(?:Next)?Part_|--\d+-\d+-\d+=:\d+|Content-Type: )")  # some bizarre prefix that is found in some messages
-SECTION_SUFFIX = re.compile(r"^ ?Yahoo! Mail")
+SECTION_PREFIX = re.compile(r"^(?:-{5,}_?=_(?:Next)?Part_|--\d+-\d+-\d+=:\d+|Content-Type:)")  # some bizarre prefix that is found in some messages
+SECTION_SUFFIX = re.compile(r"^Yahoo! Mail")
 HYPHENS = re.compile(r"^[-_]+$")
+FAKE_ID_MAX = 1000000
 
 
 def get_body(email: str) -> str:
-    # TODO: still imperfect; needs more tuning
     # TODO: handle weird = truncations
     email = LINEBREAKS.sub("\n", email).strip()
     found: bool = False
     has_section: bool = False
     lines = []
+    line_count = 0
     for line in email.split('\n'):
+        line = line.strip()
         line_has_section = SECTION_PREFIX.match(line)
         line_has_suffix = SECTION_SUFFIX.match(line)
-        if found and (line_has_suffix or (has_section and line_has_section)) and len(lines) > 0:
+        if found and line_has_section and len(lines) == 0:
+            found = False
+            lines.clear()
+            has_section = True
+        elif found and has_section and (line_has_suffix or line_has_section):
             break
         elif found and line_has_section:
             found = False
             lines.clear()
             has_section = True
+        elif found and len(line) == 0 and len(lines) == 0:
+            pass
         elif found:
             lines.append(line)
         elif len(line) == 0:
             found = True
 
     # remove trailing "--------"s
-    if len(lines) > 0 and HYPHENS.match(lines[-1]):
+    while len(lines) > 0 and HYPHENS.match(lines[-1]):
         lines = lines[:-1]
 
     output = html.unescape('\n'.join(lines).strip())
@@ -51,45 +59,96 @@ def get_body(email: str) -> str:
         return output
 
 
-def run(params: typing.List[str]):
-    if len(params) == 0:
-        print("No file was specified")
-        exit(1)
-    filename = params[0]
-    if not os.path.exists(filename):
-        print(f"Could not find a file by the name '{filename}'")
-        exit(1)
-    output_dir_base = "groups" if len(params) == 1 else params[1]
-    output_dir = os.path.join(output_dir_base, filename.split('.')[0])
-    with open(params[0], 'rb') as stream:
-        for record in ArchiveIterator(stream):
-            if record.rec_type != 'resource':
-                continue
-            match = MESSAGE_ID.match(record.rec_headers.get_header('WARC-Target-URI'))
-            if not match:
-                continue
-            message_id = match.group(1)
-            if not os.path.exists(output_dir_base):
-                os.mkdir(output_dir_base)
-            if not os.path.exists(output_dir):
-                os.mkdir(output_dir)
-            file_content = record.content_stream().read()
-            data = json.loads(file_content)
-            with open(os.path.join(output_dir, message_id + ".json"), 'w', encoding='UTF-8') as output:
-                json.dump({
-                    "id": int(message_id),
-                    "subject": html.unescape(data['subject']) if 'subject' in data else None,
-                    "user": {
-                        "displayName": html.unescape(data['from']),
-                        "realName": data['authorName'],
-                        "userName": data['profile'] if 'profile' in data else None,
-                        "id": data['userId'],
-                    },
-                    "postDate": data['postDate'],
-                    "body": get_body(data['rawEmail']),
-                    "nextInTime": data['nextInTime']
-                }, output, separators=(',', ':'))
+class Extractor:
+    userless_ids: typing.Dict[str, int] = {}
+    userless_id = 0
+
+    def __init__(self, output_dir_base: typing.Optional[str] = None):
+        self.output_dir_base: str = output_dir_base if output_dir_base is not None else "data"
+        self.user_data: typing.Dict[int, typing.Dict[str, typing.Any]] = self.load_user_data()
+
+    def load_user_data(self) -> typing.Dict[int, typing.Dict[str, typing.Any]]:
+        data: typing.Dict[int, typing.Dict[str, typing.Any]] = {}
+        data_dir = os.path.join(self.output_dir_base, "users")
+        os.makedirs(data_dir, exist_ok=True)
+        for file in os.listdir(data_dir):
+            with open(os.path.join(data_dir, file), 'r') as file_data:
+                json_data = json.load(file_data)
+                json_data["knownAliases"] = set(json_data["knownAliases"])
+                json_data["knownGroups"] = set(json_data["knownGroups"])
+                user_id: int = int(file.split('.')[0])
+                data[user_id] = json_data
+                if user_id < FAKE_ID_MAX:
+                    self.userless_ids[json_data['knownAliases'][0]] = user_id
+        return data
+
+    def save_user_data(self):
+        for user_id, user_data in self.user_data.items():
+            user_data["knownAliases"] = list(set(user_data["knownAliases"]))
+            user_data["knownGroups"] = list(set(user_data["knownGroups"]))
+            with open(os.path.join(self.output_dir_base, "users", f"{user_id}.json"), 'w') as f:
+                json.dump(user_data, f)
+
+    def run(self, input_path: typing.Optional[str] = None):
+        input_path = input_path if input_path is not None else "archives"
+        for filename in os.listdir(input_path):
+            group = filename.split('.')[0]
+            filename = os.path.join(input_path, filename)
+            group_output_dir = os.path.join(self.output_dir_base, "groups", group)
+            with open(filename, 'rb') as stream:
+                for record in ArchiveIterator(stream):
+                    self.process_record(record, group, group_output_dir)
+        self.save_user_data()
+
+    def _next_id(self) -> int:
+        self.userless_id += 1
+        return self.userless_id
+
+    def get_userless_id(self, alias: str) -> int:
+        if alias not in self.userless_ids:
+            self.userless_ids[alias] = self._next_id()
+        return self.userless_ids[alias]
+
+    def process_record(self, record, group: str, group_output_dir: str):
+        if record.rec_type != 'resource':
+            return
+        match = MESSAGE_ID.match(record.rec_headers.get_header('WARC-Target-URI'))
+        if not match:
+            return
+
+        message_id = match.group(1)
+        os.makedirs(group_output_dir, exist_ok=True)
+        file_content = record.content_stream().read()
+        data = json.loads(file_content)
+
+        alias: str = data['authorName'] if data['authorName'] else html.unescape(data['from'])  # TODO: get "X-Sender" from email headers?
+        user_id: int = data['userId'] if data['userId'] != 0 else self.get_userless_id(alias)
+        if user_id in self.user_data:
+            aliases: set[str] = self.user_data[user_id]['knownAliases']
+            aliases.add(alias)
+        else:
+            self.user_data[user_id] = {
+                "userName": data['profile'] if 'profile' in data else None,
+                "knownAliases": {alias},
+                "knownGroups": {group},
+                "id": user_id,
+                "fakeAccount": user_id < 1000000
+            }
+
+        with open(os.path.join(group_output_dir, message_id + ".json"), 'w', encoding='UTF-8') as output:
+            json.dump({
+                "id": int(message_id),
+                "subject": html.unescape(data['subject']) if 'subject' in data else None,
+                "authorId": user_id,
+                "alias": alias,
+                "postDate": data['postDate'],
+                "body": get_body(data['rawEmail']),
+                "nextInTime": data['nextInTime']
+            }, output, separators=(',', ':'))
 
 
 if __name__ == '__main__':
-    run(sys.argv[1:])
+    _args = sys.argv[1:]
+    _output = _args[0] if len(_args) > 0 else None
+    _input = _args[1] if len(_args) > 1 else None
+    Extractor(_output).run(_input)
-Original file line number
+Diff line change
@@ -1,3 +1,3 @@
     .idea/
     *.iml
-    groups/
+    /data/