hints feature #581
Changes from 1 commit
File: .gitignore

@@ -7,6 +7,7 @@ config/prod.json
node_modules
frontend/static/js-build
data/

.DS_Store
.vscode
File: sprints API blueprint

@@ -3,14 +3,22 @@
import json
import datetime
import wsbot

from app.db import get_db
from pymysql.cursors import DictCursor

from util.decorators import check_admin, check_request_json
from wikispeedruns import prompts
from wsbot.search import GreedySearch, BeamSearch
from wsbot.embeddings import LocalEmbeddings
from wsbot.graph import APIGraph, SQLGraph

sprint_api = Blueprint('sprints', __name__, url_prefix='/api/sprints')
# this script doesn't work
# !./get_embeddings.sh
embeddings_provider = LocalEmbeddings("data/wiki2vec.txt")
graph_provider = APIGraph()


### Prompt Management Endpoints
@@ -168,3 +176,23 @@ def check_duplicate_prompt():
    res = prompts.check_for_sprint_duplicates(start, end)
    return jsonify(res)


# get the next hint
@sprint_api.get('/hint')
# @check_request_json({"start": str, "end": str})
def get_hint():
    start = request.args.get('start')
    end = request.args.get('end')

    print(start)
    print(end)

    if (start is None or end is None): return "Invalid Request", 400

    # which algorithm to use?
    # greedy = GreedySearch(embeddings_provider, graph_provider)
    # path = greedy.search(start, end)

    beam = BeamSearch(embeddings_provider, graph_provider)
    path = beam.search(start, end)

    return path

Review comment: So here, what you should do instead of doing the whole search is do a single step, i.e. look at the pages linked from your current article, and then compare them with the embedding of the goal. Doing a whole beam search is a lot of unnecessary work. Note this would require modifying the wsbot library, which I can help with if you're interested. The best way I see of doing that is adding a separate function for hints that returns rankings.

Reply: Ah, that makes sense, I'll look into this. Thanks for reviewing!

Reply: @dqian3 Sorry for the wait. Instead of making a function for hints that returns rankings, I just made one that iterates through the current page's links and returns the closest word based on a greedy approach. I added code for the greedy search to use this but left it commented out for now. Could you take a look when you get a chance?
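To illustrate the single-step approach suggested above: rather than running a full search per hint, the handler could look only at the links of the current article and return the one closest to the goal embedding. The sketch below is hypothetical (the helper name closest_link and its placement are assumptions, not part of this PR); it reuses the provider interfaces defined later in this diff.

from scipy.spatial import distance

def closest_link(embeddings, graph, current: str, goal: str):
    # Single-step hint: among the pages linked from `current`, return the one
    # whose embedding is closest (by cosine distance) to the goal article.
    # `embeddings` and `graph` follow the provider interfaces in this PR.
    goal_v = embeddings.get_embedding(goal)
    best_link, best_dist = None, float("inf")
    for link in graph.get_links(current):
        if link == goal:
            return link  # the goal itself is linked: best possible hint
        try:
            link_v = embeddings.get_embedding(link)
        except KeyError:
            continue  # article missing from the embeddings vocabulary
        d = distance.cosine(link_v, goal_v)
        if d < best_dist:
            best_link, best_dist = link, d
    return best_link

The /hint handler could then return just this one article instead of a whole path, avoiding the repeated link and embedding lookups that a full BeamSearch.search performs.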
File: frontend HUD template

@@ -57,6 +57,12 @@
        Current Article<br><strong>[[currentArticle]]</strong>
      </div>
    </div>
    <div style="float: right;" >
      <button class="col text-nowrap px-1 pt-1" @click="getHint([[currentArticle]], [[endArticle]])">
        Need a hint?
      </button>
      <div id="hint"></div>
    </div>
  </div>
</div>
<div v-else class="HUDwrapper HUDwrapper-fade container-xxl">

Review comment: This looks good!
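For manual testing, the endpoint behind this button can also be exercised directly, passing the current and goal articles as query parameters. A rough sketch follows; the local host/port and article titles are assumptions, while the route comes from the blueprint's url_prefix ('/api/sprints') plus the '/hint' route added in this PR.

import requests

# Assumed local dev server address; adjust host/port as needed.
resp = requests.get(
    "http://localhost:5000/api/sprints/hint",
    params={"start": "Albert Einstein", "end": "Banana"},
)
print(resp.status_code)
print(resp.text)  # whatever get_hint() returns for this pair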
File: get_embeddings.sh

@@ -0,0 +1,11 @@
#!/bin/bash

EMBEDDINGS_FILE="data/wiki2vec.txt"
if [[ ! -f $EMBEDDINGS_FILE ]]; then
    mkdir -p data
    wget "http://wikipedia2vec.s3.amazonaws.com/models/en/2018-04-20/enwiki_20180420_100d.txt.bz2" -O $EMBEDDINGS_FILE.bz2
    bunzip2 $EMBEDDINGS_FILE.bz2
else
    echo "\"$EMBEDDINGS_FILE\" already exists! Skipping..."
fi
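If it is more convenient to fetch the embeddings from Python (for example on a machine without wget), a rough standard-library equivalent of the script might look like the sketch below; the URL and paths mirror the script above, and nothing here is part of the PR.

import bz2
import os
import shutil
import urllib.request

URL = "http://wikipedia2vec.s3.amazonaws.com/models/en/2018-04-20/enwiki_20180420_100d.txt.bz2"
ARCHIVE = "data/wiki2vec.txt.bz2"
TARGET = "data/wiki2vec.txt"

if not os.path.exists(TARGET):
    os.makedirs("data", exist_ok=True)
    urllib.request.urlretrieve(URL, ARCHIVE)
    # Stream the decompression so the large text file is never held in memory.
    with bz2.open(ARCHIVE, "rb") as src, open(TARGET, "wb") as dst:
        shutil.copyfileobj(src, dst)
    os.remove(ARCHIVE)
else:
    print(f'"{TARGET}" already exists! Skipping...')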
File: wsbot/embeddings.py

@@ -0,0 +1,17 @@
from abc import ABC, abstractmethod
from wikipedia2vec import Wikipedia2Vec


class EmbeddingsProvider(ABC):
    @abstractmethod
    def get_embedding(self, article: str):
        pass

    def get_embeddings(self, articles):
        return [self.get_embedding(a) for a in articles]


class LocalEmbeddings(EmbeddingsProvider):
    def __init__(self, filename: str):
        self.wiki2vec = Wikipedia2Vec.load_text(filename)

    def get_embedding(self, article: str):
        return self.wiki2vec.get_entity_vector(article)
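A provider only has to implement get_embedding; get_embeddings then batches it. A tiny in-memory provider (hypothetical test scaffolding, not part of this PR) shows the interface without loading the large wiki2vec file:

import numpy as np
from wsbot.embeddings import EmbeddingsProvider  # module path used by this PR

class DictEmbeddings(EmbeddingsProvider):
    # Hypothetical test double that looks vectors up in a plain dict.
    def __init__(self, vectors):
        self.vectors = vectors

    def get_embedding(self, article: str):
        return self.vectors[article]  # raises KeyError for unknown articles, like LocalEmbeddings

emb = DictEmbeddings({"Cat": np.array([1.0, 0.0]), "Dog": np.array([0.9, 0.1])})
print(emb.get_embeddings(["Cat", "Dog"]))  # batch helper calls get_embedding per article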
File: wsbot/graph.py

@@ -0,0 +1,78 @@
from abc import ABC, abstractmethod
from wikipedia2vec import Wikipedia2Vec

import pymysql
from pymysql.cursors import DictCursor

import requests

# TODO make these context providers?
class GraphProvider(ABC):
    '''
    Provide the outgoing links and other operations on the Wikipedia graph
    '''

    @abstractmethod
    def get_links(self, article):
        pass

    def get_links_batch(self, articles):
        return [self.get_links(a) for a in articles]


class APIGraph(GraphProvider):
    '''
    Graph queries served by the public Wikipedia API
    '''
    URL = "https://en.wikipedia.org/w/api.php"
    PARAMS = {
        "action": "query",
        "format": "json",
        "prop": "links",
        "pllimit": "max"
    }

    def __init__(self):
        pass

    def _links_from_resp(self, resp):
        links = list(resp["query"]["pages"].values())[0]["links"]
        links = [link["title"] for link in links]
        # Drop namespaced titles (e.g. "Category:...", "Template:...")
        return list(filter(lambda title: ":" not in title, links))

    def get_links(self, article):
        resp = requests.get(self.URL, params={**self.PARAMS, "titles": article}).json()
        return self._links_from_resp(resp)

    def get_links_batch(self, articles):
        # TODO figure out what happens if this returns too much
        resp = requests.get(self.URL, params={**self.PARAMS, "titles": "|".join(articles)}).json()
        return self._links_from_resp(resp)


class SQLGraph(GraphProvider):
    '''
    Graph queries served by the custom wikipedia speedruns SQL database graph
    '''
    def __init__(self, host, user, password, database):
        self.db = pymysql.connect(host=host, user=user, password=password, database=database)
        self.cursor = self.db.cursor(cursor=DictCursor)

    def get_links(self, article):
        id_query = "SELECT * FROM articleid WHERE name=%s"
        edge_query = """
            SELECT a.name FROM edgeidarticleid AS e
            JOIN articleid AS a
            ON e.dest = a.articleID
            WHERE e.src = %s
        """
        self.cursor.execute(id_query, article)
        row = self.cursor.fetchone()
        if row is None: return None
        article_id = row["articleID"]

        self.cursor.execute(edge_query, article_id)

        return [row["name"] for row in self.cursor.fetchall()]

    # TODO write a query that does this properly
    #def get_links_batch(self, articles):
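For reference, the MediaWiki response that _links_from_resp unpacks looks roughly like the trimmed example below (page IDs and titles are illustrative); namespaced titles such as "Category:..." are removed by the ':' filter.

# Trimmed shape of a prop=links query response from the Wikipedia API:
resp = {
    "query": {
        "pages": {
            "736": {
                "pageid": 736,
                "title": "Albert Einstein",
                "links": [
                    {"ns": 0, "title": "Annus Mirabilis papers"},
                    {"ns": 14, "title": "Category:1879 births"},
                ],
            }
        }
    }
}

links = list(resp["query"]["pages"].values())[0]["links"]
print([l["title"] for l in links if ":" not in l["title"]])  # ['Annus Mirabilis papers']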
File: wsbot/search.py

@@ -0,0 +1,114 @@
import scipy
from scipy.spatial import distance

# TODO base class

class MaxIterationsException(Exception):
    pass

class PathNotFoundException(Exception):
    pass


class GreedySearch:
    def __init__(self, embedding_provider, graph_provider, max_iterations=20):
        self.embeddings = embedding_provider
        self.graph = graph_provider
        self.max_iterations = max_iterations

    def search(self, start: str, end: str):
        # Greedily searches the wikipedia graph
        cur = start
        end_v = self.embeddings.get_embedding(end)

        ret = [start, ]

        for i in range(self.max_iterations):
            min_dist = 2
            next_article = ""

            for link in self.graph.get_links(cur):
                if link in ret:
                    continue

                if (link == end):
                    #print(f"Found link in {cur}!")
                    ret.append(link)
                    return ret

                try:
                    cur_v = self.embeddings.get_embedding(link)
                except KeyError:
                    continue

                dist = distance.cosine(cur_v, end_v)

                if dist <= min_dist:
                    next_article = link
                    min_dist = dist

            if next_article == "":
                raise PathNotFoundException(f"GreedySearch: could not find path, current: {ret}")

            ret.append(next_article)
            cur = next_article

        raise MaxIterationsException(f"GreedySearch: Max iterations {self.max_iterations} reached, current path: {ret}")


class BeamSearch:
    def __init__(self, embedding_provider, graph_provider, max_iterations=20, width=10):
        self.embeddings = embedding_provider
        self.graph = graph_provider
        self.max_iterations = max_iterations
        self.width = width

    def _get_path(self, end, parent):
        ret = []
        cur = end
        while (parent[cur] != cur):
            ret.append(cur)
            cur = parent[cur]

        ret.append(cur)
        return list(reversed(ret))

    def search(self, start: str, end: str):
        # Define distance metric
        # TODO customizable
        end_v = self.embeddings.get_embedding(end)
        def get_dist(article):
            try:
                cur_v = self.embeddings.get_embedding(article)
            except KeyError:
                return 100
            return distance.cosine(cur_v, end_v)

        # Greedily searches the wikipedia graph
        cur_set = [start]
        # Keeps track of parent articles, also serves as visitor set
        parent = {start: start}

        for i in range(self.max_iterations):
            next_set = []
            for article in cur_set:
                outgoing = self.graph.get_links(article)
                for link in outgoing:
                    if link in parent:
                        continue
                    parent[link] = article
                    next_set.append((get_dist(link), link))

                    if link == end:
                        return self._get_path(link, parent)

            cur_set = [article for (_, article) in sorted(next_set)]
            cur_set = cur_set[:self.width]
            print(f"Articles in iteration {i}: ", cur_set)

        raise MaxIterationsException(f"BeamSearch: Max iterations {self.max_iterations} reached")

# TODO probabilistic search (for random results)
# TODO other heuristics
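As a quick sanity check of the search classes, stub providers over a tiny hand-made graph are enough; everything below is hypothetical test scaffolding (the import path matches the one used in the sprints API).

import numpy as np
from wsbot.search import GreedySearch  # import path used in the sprints API

class StubEmbeddings:
    VECS = {
        "Start": np.array([1.0, 0.0]),
        "Near":  np.array([0.6, 0.8]),
        "Far":   np.array([-1.0, 0.0]),
        "Goal":  np.array([0.0, 1.0]),
    }
    def get_embedding(self, article):
        return self.VECS[article]  # KeyError for unknown articles, like LocalEmbeddings

class StubGraph:
    LINKS = {
        "Start": ["Far", "Near"],
        "Near":  ["Goal", "Far"],
    }
    def get_links(self, article):
        return self.LINKS.get(article, [])

greedy = GreedySearch(StubEmbeddings(), StubGraph())
print(greedy.search("Start", "Goal"))  # ['Start', 'Near', 'Goal']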
Review comment: Yeah, so that is a Jupyter notebook specific thing. In a real setting, we would just run this ourselves on the command line somewhere. Maybe modify this comment to say something about needing to download the embeddings.