Skip to content

Commit

Permalink
Add User API & data pre-processing fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
qixils committed Jan 22, 2022
1 parent 88bcb65 commit 1b23ad8
Show file tree
Hide file tree
Showing 8 changed files with 160 additions and 69 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
.idea/
*.iml
groups/
/data/
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@ import java.time.OffsetDateTime

class Message(
var id: Int = 0,
var user: User = User(),
var authorId: Int = 0,
var alias: String = "",
var postDate: OffsetDateTime = OffsetDateTime.MIN,
var subject: String? = null,
var body: String = "",
Expand Down
7 changes: 5 additions & 2 deletions viewer-backend/src/main/kotlin/dev/qixils/yahoo/api/User.kt
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
package dev.qixils.yahoo.api

import java.util.*

class User (
var id: Int = 0,
var userName: String? = null,
var realName: String = "",
var displayName: String = ""
var knownAliases: Set<String> = Collections.emptySet(),
var knownGroups: Set<String> = Collections.emptySet(),
var fakeAccount: Boolean = false
)
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,7 @@ package dev.qixils.yahoo.api.plugins

import com.google.gson.Gson
import com.google.gson.GsonBuilder
import dev.qixils.yahoo.api.Message
import dev.qixils.yahoo.api.MessageReference
import dev.qixils.yahoo.api.OffsetDateTimeAdapter
import dev.qixils.yahoo.api.Page
import dev.qixils.yahoo.api.*
import io.ktor.http.*
import io.ktor.server.application.*
import io.ktor.server.response.*
Expand All @@ -21,7 +18,8 @@ val gson: Gson = GsonBuilder()
.registerTypeAdapter(typeOf<OffsetDateTime>().javaType, OffsetDateTimeAdapter())
.disableHtmlEscaping()
.create()
val messages = HashMap<MessageReference, Message>()
val messages = HashMap<MessageReference, Message?>()
val users = HashMap<Int, User?>()
val messageIndices = HashMap<String, List<Int>?>()
val allGroups = HashSet<String>()
const val messagesPerPage = 50
Expand All @@ -32,6 +30,24 @@ fun Application.configureRouting() {
call.respond(mapOf("groups" to getGroups()))
}

// GET /v1/user/{id}: responds with the stored user for a numeric ID.
// 400 when the path segment is not an integer, 404 when no user is known for it.
get("/v1/user/{id}") {
    val id: Int? = call.parameters["id"]?.toIntOrNull()
    if (id == null) {
        call.respond(
            HttpStatusCode.BadRequest,
            mapOf("error" to "The argument 'id' could not be parsed")
        )
    } else {
        // Resolve the user once; the null check replaces the former
        // userExists(id) + getUser(id)!! pair, which performed two lookups.
        val user: User? = getUser(id)
        if (user == null) {
            call.respond(
                HttpStatusCode.NotFound,
                mapOf("error" to "A user with the ID of '$id' could not be found")
            )
        } else {
            call.respond(user)
        }
    }
}

get("/v1/message/{group}/{id}") {
val group: String = call.parameters["group"] as String
val id: Int? = call.parameters["id"]?.toIntOrNull()
Expand Down Expand Up @@ -103,24 +119,37 @@ fun Application.configureRouting() {
}
}

/** Reports whether [getUser] can resolve a user for the given [id]. */
fun userExists(id: Int): Boolean = getUser(id) != null

/**
 * Looks up a user by numeric ID, loading `data/users/<id>.json` on first access.
 *
 * Both hits and misses are memoized in [users] (a miss is stored as `null`), so a
 * given ID is read from disk at most once.
 *
 * @param id the numeric user ID
 * @return the parsed [User], or `null` when no user file exists for [id]
 */
fun getUser(id: Int): User? {
    if (users.containsKey(id)) return users[id]
    // User records are stored under data/users/; data/groups/ contains one directory
    // per group, so looking for "<id>.json" there would never find a user.
    val file = File("data/users/$id.json")
    val user: User? = if (file.isFile) gson.fromJson(file.readText(), User::class.java) else null
    users[id] = user
    return user
}

fun getGroups(): Set<String> {
if (allGroups.isNotEmpty()) return allGroups
val directoryURL = Application::class.java.getResource("/groups/")
val directory = File(directoryURL!!.path)
val directory = File("data/groups/")
for (file in directory.list()!!)
allGroups.add(file)
return allGroups
}

fun isValid(group: String): Boolean {
if (messageIndices.containsKey(group)) return messageIndices[group] != null
val clazz = Application::class.java
val baseURL = clazz.getResource("/groups/$group/")
if (baseURL == null) {
val directory = File("data/groups/$group/")
if (!directory.exists()) {
messageIndices[group] = null
return false
}
val directory = File(baseURL.path)
val files = directory.list { _: File, filename: String -> filename.endsWith(".json") }
if (files == null) {
messageIndices[group] = null
Expand All @@ -142,14 +171,14 @@ fun isValid(group: String): Boolean {
}

fun fetchMessage(reference: MessageReference): Message? {
val cache: Message? = messages[reference]
if (cache != null)
return cache

val text: String = Application::class.java
.getResource("/groups/${reference.group}/${reference.id}.json")
?.readText() ?: return null
if (messages.containsKey(reference)) return messages[reference]

val file = File("data/groups/${reference.group}/${reference.id}.json")
if (!file.exists()) {
messages[reference] = null
return null
}
val text: String = file.readText()
val value: Message = gson.fromJson(text, Message::class.java)
messages[reference] = value
return value
Expand Down
2 changes: 0 additions & 2 deletions viewer-frontend/src/routes/group/[group]/[page].svelte
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
<!--suppress TypeScriptUnresolvedVariable-->
<!--(IntelliJ is for some reason not parsing the tsconfig.json file)-->
<script context="module">
/** @type {import('@sveltejs/kit').Load} */
export async function load({ params, fetch, session, stuff }) {
Expand Down
1 change: 1 addition & 0 deletions viewer-frontend/src/routes/index.svelte
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import Error from "$lib/Error.svelte";
<p>Loading groups, please wait...</p>
{:then group_data}
<!-- Form theming adopted from https://flowbite.com/docs/components/forms/ -->
<!-- TODO: once form is expanded, should totally use the fancy gradient outline button from https://flowbite.com/docs/components/buttons/ -->
<select bind:value={selected} on:change='{() => window.location.href = "/group/" + selected + "/1"}' class="bg-gray-50 border border-gray-300 text-gray-900 text-sm rounded-lg focus:ring-blue-500 focus:border-blue-500 block w-full p-2.5 dark:bg-gray-700 dark:border-gray-600 dark:placeholder-gray-400 dark:text-white dark:focus:ring-blue-500 dark:focus:border-blue-500">
{#each group_data.groups as group}
<option value={group}>{group}</option>
Expand Down
2 changes: 1 addition & 1 deletion warc-extractor/.gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
*.warc
/archives/
/venv/
possible_errors.txt
147 changes: 103 additions & 44 deletions warc-extractor/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,34 +10,42 @@

MESSAGE_ID = re.compile(r"^org\.archive\.yahoogroups:v1/group/[a-z_]+/message/(\d+)/raw$")
LINEBREAKS = re.compile(r"\r?\n")
SECTION_PREFIX = re.compile(r"^(?:-{5,}_?=_(?:Next)?Part_|--\d+-\d+-\d+=:\d+|Content-Type: )") # some bizarre prefix that is found in some messages
SECTION_SUFFIX = re.compile(r"^ ?Yahoo! Mail")
SECTION_PREFIX = re.compile(r"^(?:-{5,}_?=_(?:Next)?Part_|--\d+-\d+-\d+=:\d+|Content-Type:)") # some bizarre prefix that is found in some messages
SECTION_SUFFIX = re.compile(r"^Yahoo! Mail")
HYPHENS = re.compile(r"^[-_]+$")
FAKE_ID_MAX = 1000000


def get_body(email: str) -> str:
# TODO: still imperfect; needs more tuning
# TODO: handle weird = truncations
email = LINEBREAKS.sub("\n", email).strip()
found: bool = False
has_section: bool = False
lines = []
line_count = 0
for line in email.split('\n'):
line = line.strip()
line_has_section = SECTION_PREFIX.match(line)
line_has_suffix = SECTION_SUFFIX.match(line)
if found and (line_has_suffix or (has_section and line_has_section)) and len(lines) > 0:
if found and line_has_section and len(lines) == 0:
found = False
lines.clear()
has_section = True
elif found and has_section and (line_has_suffix or line_has_section):
break
elif found and line_has_section:
found = False
lines.clear()
has_section = True
elif found and len(line) == 0 and len(lines) == 0:
pass
elif found:
lines.append(line)
elif len(line) == 0:
found = True

# remove trailing "--------"s
if len(lines) > 0 and HYPHENS.match(lines[-1]):
while len(lines) > 0 and HYPHENS.match(lines[-1]):
lines = lines[:-1]

output = html.unescape('\n'.join(lines).strip())
Expand All @@ -51,45 +59,96 @@ def get_body(email: str) -> str:
return output


def run(params: typing.List[str]):
if len(params) == 0:
print("No file was specified")
exit(1)
filename = params[0]
if not os.path.exists(filename):
print(f"Could not find a file by the name '{filename}'")
exit(1)
output_dir_base = "groups" if len(params) == 1 else params[1]
output_dir = os.path.join(output_dir_base, filename.split('.')[0])
with open(params[0], 'rb') as stream:
for record in ArchiveIterator(stream):
if record.rec_type != 'resource':
continue
match = MESSAGE_ID.match(record.rec_headers.get_header('WARC-Target-URI'))
if not match:
continue
message_id = match.group(1)
if not os.path.exists(output_dir_base):
os.mkdir(output_dir_base)
if not os.path.exists(output_dir):
os.mkdir(output_dir)
file_content = record.content_stream().read()
data = json.loads(file_content)
with open(os.path.join(output_dir, message_id + ".json"), 'w', encoding='UTF-8') as output:
json.dump({
"id": int(message_id),
"subject": html.unescape(data['subject']) if 'subject' in data else None,
"user": {
"displayName": html.unescape(data['from']),
"realName": data['authorName'],
"userName": data['profile'] if 'profile' in data else None,
"id": data['userId'],
},
"postDate": data['postDate'],
"body": get_body(data['rawEmail']),
"nextInTime": data['nextInTime']
}, output, separators=(',', ':'))
class Extractor:
    """Extracts Yahoo Groups messages from WARC archives into per-message JSON files.

    Messages are written to ``<output_dir_base>/groups/<group>/<message id>.json`` and
    one record per author is written to ``<output_dir_base>/users/<user id>.json``.
    Authors whose ``userId`` is 0 are given a synthetic ("fake") ID keyed by their alias.
    """

    # Class-level defaults are kept for backward compatibility; every instance gets
    # its own copies in __init__ so the alias table is never shared between instances.
    userless_ids: typing.Dict[str, int] = {}
    userless_id = 0

    def __init__(self, output_dir_base: typing.Optional[str] = None):
        """
        :param output_dir_base: root directory for generated data; defaults to "data".
        """
        self.output_dir_base: str = output_dir_base if output_dir_base is not None else "data"
        # alias -> synthetic user ID, and the highest synthetic ID handed out so far.
        # Both must exist before load_user_data() runs because it repopulates them.
        self.userless_ids: typing.Dict[str, int] = {}
        self.userless_id: int = 0
        self.user_data: typing.Dict[int, typing.Dict[str, typing.Any]] = self.load_user_data()

    def load_user_data(self) -> typing.Dict[int, typing.Dict[str, typing.Any]]:
        """Load previously saved user records from ``<output_dir_base>/users``.

        The directory is created if missing. ``knownAliases`` and ``knownGroups`` are
        converted to sets. For IDs below ``FAKE_ID_MAX`` the alias table and the
        synthetic-ID counter are restored so that later runs reuse existing IDs
        instead of handing out colliding ones.

        :return: mapping of user ID to that user's record.
        """
        data: typing.Dict[int, typing.Dict[str, typing.Any]] = {}
        data_dir = os.path.join(self.output_dir_base, "users")
        os.makedirs(data_dir, exist_ok=True)
        for file in os.listdir(data_dir):
            if not file.endswith(".json"):
                # Ignore stray files whose name would not parse as "<id>.json".
                continue
            with open(os.path.join(data_dir, file), 'r', encoding='UTF-8') as file_data:
                json_data = json.load(file_data)
            json_data["knownAliases"] = set(json_data["knownAliases"])
            json_data["knownGroups"] = set(json_data["knownGroups"])
            user_id: int = int(file.split('.')[0])
            data[user_id] = json_data
            if user_id < FAKE_ID_MAX:
                # Sets are not subscriptable, so register every stored alias rather
                # than indexing element 0.
                for known_alias in json_data["knownAliases"]:
                    self.userless_ids[known_alias] = user_id
                # Keep the counter ahead of every persisted synthetic ID.
                self.userless_id = max(self.userless_id, user_id)
        return data

    def save_user_data(self):
        """Write every user record to ``<output_dir_base>/users/<id>.json``.

        Sets are serialised as sorted lists on a copy of each record, so the
        in-memory records keep their sets and remain usable after saving.
        """
        users_dir = os.path.join(self.output_dir_base, "users")
        os.makedirs(users_dir, exist_ok=True)
        for user_id, user_data in self.user_data.items():
            serialisable = dict(user_data)
            serialisable["knownAliases"] = sorted(set(user_data["knownAliases"]))
            serialisable["knownGroups"] = sorted(set(user_data["knownGroups"]))
            with open(os.path.join(users_dir, f"{user_id}.json"), 'w', encoding='UTF-8') as f:
                json.dump(serialisable, f)

    def run(self, input_path: typing.Optional[str] = None):
        """Process every archive in ``input_path`` (default "archives"), then save users.

        The group name is the part of each file name before the first '.'.
        """
        input_path = input_path if input_path is not None else "archives"
        for filename in os.listdir(input_path):
            group = filename.split('.')[0]
            filename = os.path.join(input_path, filename)
            group_output_dir = os.path.join(self.output_dir_base, "groups", group)
            with open(filename, 'rb') as stream:
                for record in ArchiveIterator(stream):
                    self.process_record(record, group, group_output_dir)
        self.save_user_data()

    def _next_id(self) -> int:
        """Advance and return the synthetic-ID counter."""
        self.userless_id += 1
        return self.userless_id

    def get_userless_id(self, alias: str) -> int:
        """Return the synthetic ID for ``alias``, allocating one on first use."""
        if alias not in self.userless_ids:
            self.userless_ids[alias] = self._next_id()
        return self.userless_ids[alias]

    def process_record(self, record, group: str, group_output_dir: str):
        """Write one message JSON file for ``record`` and update the author's record.

        Records that are not of type 'resource', or whose WARC-Target-URI does not
        match ``MESSAGE_ID``, are ignored.
        """
        if record.rec_type != 'resource':
            return
        match = MESSAGE_ID.match(record.rec_headers.get_header('WARC-Target-URI'))
        if not match:
            return

        message_id = match.group(1)
        os.makedirs(group_output_dir, exist_ok=True)
        file_content = record.content_stream().read()
        data = json.loads(file_content)

        alias: str = data['authorName'] if data['authorName'] else html.unescape(data['from'])  # TODO: get "X-Sender" from email headers?
        user_id: int = data['userId'] if data['userId'] != 0 else self.get_userless_id(alias)
        if user_id in self.user_data:
            known_user = self.user_data[user_id]
            known_user['knownAliases'].add(alias)
            # A known user posting in another group must have that group recorded too.
            known_user['knownGroups'].add(group)
        else:
            self.user_data[user_id] = {
                "userName": data['profile'] if 'profile' in data else None,
                "knownAliases": {alias},
                "knownGroups": {group},
                "id": user_id,
                "fakeAccount": user_id < FAKE_ID_MAX
            }

        with open(os.path.join(group_output_dir, message_id + ".json"), 'w', encoding='UTF-8') as output:
            json.dump({
                "id": int(message_id),
                "subject": html.unescape(data['subject']) if 'subject' in data else None,
                "authorId": user_id,
                "alias": alias,
                "postDate": data['postDate'],
                "body": get_body(data['rawEmail']),
                "nextInTime": data['nextInTime']
            }, output, separators=(',', ':'))


if __name__ == '__main__':
run(sys.argv[1:])
_args = sys.argv[1:]
_output = _args[0] if len(_args) > 0 else None
_input = _args[1] if len(_args) > 1 else None
Extractor(_output).run(_input)

0 comments on commit 1b23ad8

Please sign in to comment.