Merge pull request #16 from Martin-lc/dev-donglin-t2

Dev donglin t2
Martin-lc · Nov 30, 2023 · 8d28cb4 · 8d28cb4
2 parents 2e15c0c + 2e2482d
commit 8d28cb4
Show file tree

Hide file tree

Showing 4 changed files with 234 additions and 78 deletions.
diff --git a/content_ranker/README.md b/content_ranker/README.md
@@ -1,6 +1,6 @@
 # Content Ranking
 
-This part of module will rank all the summarized contents and return the 1st content to the client. The content_ranker.js module is designed to rank user-generated content based on user preferences. It interfaces with the OpenAI API to generate related words for user preferences and then evaluates and ranks content based on the frequency of these words.
+This part of the module ranks all the summarized contents and returns the top-ranked content to the client. The content_ranker.js module is designed to rank user-generated content based on user preferences. It interfaces with the OpenAI API to generate related words for the user preferences and to embed the preferences and the contents, and then evaluates and ranks the contents based on the frequency of these words and on the semantic similarity between the preferences and the contents.
 
 ## Dependencies
 
@@ -9,6 +9,40 @@ sqlite3: Used for database operations, specifically fetching user data from a da
 
 ## Functions
 
+### getEmbeddings(text)
+Parameters:
+
+text (String): The text for which embeddings are to be fetched.
+Description:
+Fetches embeddings for a given text using the OpenAI API. This is useful for advanced content analysis and similarity calculations.
+
+Returns:
+A Promise that resolves to the embedding vector (an Array of numbers) for the given text.
+
+### cosineSimilarity(vecA, vecB)
+Parameters:
+
+vecA (Array): The first vector.
+vecB (Array): The second vector.
+Description:
+Calculates the cosine similarity between two vectors. This is a measure of similarity between two non-zero vectors of an inner product space that measures the cosine of the angle between them.
+
+Returns:
+The cosine similarity (a number) between the two vectors.
+
+### combinedScore(content, preferencesVector, generatedWords)
+
+Parameters:
+
+content (String): The content to score.
+preferencesVector (Array): The preferences vector obtained from embeddings.
+generatedWords (Array): An array of words generated based on user preferences.
+Description:
+Calculates a combined score for a piece of content based on the cosine similarity between the content's embeddings and the preferences vector, as well as the frequency of generated words in the content.
+
+Returns:
+A Promise that resolves to a number representing the combined relevance score of the content.
+
 ### generateRelatedWords(preferences)
 
 Parameters:
@@ -44,30 +78,30 @@ Scores a piece of content based on the frequency of generated words related to u
 Returns:
 A score representing the relevance of the content to the user preferences.
 
-### rankContents(contents, generatedWords)
+
+### rankContents(contents, preferencesVector, generatedWords)
 
 Parameters:
+
 contents (Array): An array of content strings to be ranked.
+preferencesVector (Array): The vector representing user preferences.
 generatedWords (Array): An array of words related to user preferences.
-
 Description:
-Ranks each content in the contents array based on its score, which is computed using the scoreContent function.
+Ranks each piece of content by a combined score, which includes both the relevance to the user preferences (as determined by the cosine similarity between the content's embedding and the preferences vector) and the frequency of generated words within the content.
 
 Returns:
-An array of content objects sorted by score in descending order. Each object contains the content string and its associated score.
-
-### getTopContent(userId, preferences, contents)
+A Promise that resolves to an array of content objects sorted by their combined score in descending order.
 
+### getTopContent(preferences, contents)
 Parameters:
-userId (String): The ID of the user.
+
 preferences (String): The user's preferences.
 contents (Array): An array of content strings to be evaluated and ranked.
-
 Description:
-Generates related words for the user's preferences, scores each content based on these words, ranks the contents, and returns the highest-scoring content.
+Generates related words and a preferences vector for the user's preferences, then scores and ranks each piece of content based on these elements, ultimately returning the highest-scoring content.
 
 Returns:
-The content string with the highest score.
+A Promise that resolves to the content string with the highest combined score.
 
 ## Usage
 

diff --git a/content_ranker/content_ranker.js b/content_ranker/content_ranker.js
@@ -1,35 +1,87 @@
 const OpenAI = require("langchain/llms/openai").OpenAI;
 const PromptTemplate = require("langchain/prompts").PromptTemplate;
+const OpenAIEmbeddings=require("langchain/embeddings/openai").OpenAIEmbeddings;
+const math = require('mathjs');
+
+/* Create instance for embeddings */
+const embeddings = new OpenAIEmbeddings();
+
+/* Create instance for using OpenAI's language model */
 const llm = new OpenAI({
     openAIApiKey: process.env.OPENAI_API_KEY,
 });
 
+/**
+ * Fetches embeddings for a given text.
+ * @param {string} text - The text to get embeddings for.
+ * @returns {Promise<Array>} A promise that resolves to the embedding vector (an array of numbers) for the text.
+ */
+async function getEmbeddings(text) {
+    try {
+        const response = await embeddings.embedQuery(text);
+        return response;
+    } catch (error) {
+        console.error('Error fetching embeddings:', error);
+        throw error;
+    }
+}
 
+/**
+ * Calculates the cosine similarity between two vectors.
+ * @param {Array} vecA - The first vector.
+ * @param {Array} vecB - The second vector.
+ * @returns {number} The cosine similarity between the two vectors.
+ */
+function cosineSimilarity(vecA, vecB) {
+    return math.dot(vecA, vecB) / (math.norm(vecA) * math.norm(vecB));
+}
 
-async function generateRelatedWords(preferences) {
-    const prompt = PromptTemplate.fromTemplate("I want to ranking the content by the user preferences, \
-  so I will give you 3 preferences words from a user and you generate 10 related words for each \
-  preference word and i will count them in the content to do the count.Return the generated 30 words without the original words\
-  without any other text and comma-separated. Here is the preferences words:{preference}");
-
-    const formattedPrompt = await prompt.format({
-        preference: preferences,
-    });
-
-    const llmResult = await llm.predict(formattedPrompt);
-
-    // Extract the generated words from the llmResult. 
-    // This step assumes a certain structure of the llmResult which might need adjustment.
-    // Split the string into an array of words
-    const generatedWords = llmResult.split(',').map(word => word.trim());
-    console.log("generatedWords", generatedWords);
-    return {
-        originalWords: preferences.split(',').map(word => word.trim()),
-        generatedWords: generatedWords,
-    };
-
+/**
+ * Calculates a combined score based on content, preferences vector, and generated words.
+ * @param {string} content - The content to score.
+ * @param {Array} preferencesVector - The preferences vector.
+ * @param {Array} generatedWords - The generated words.
+ * @returns {Promise<number>} A promise that resolves to the combined score.
+ */
+async function combinedScore(content, preferencesVector, generatedWords) {
+    try {
+        const contentVector = await getEmbeddings(content);
+        const cosineScore = cosineSimilarity(preferencesVector, contentVector);
+        const originalScore = scoreContent(generatedWords, content);
+        return (cosineScore * 0.5 + originalScore * 0.5) * 100;
+    } catch (error) {
+        console.error('Error calculating combined score:', error);
+        throw error;
+    }
 }
 
+/**
+ * Generates related words based on given preferences.
+ * @param {string} preferences - The preferences words.
+ * @returns {Promise<Object>} A promise that resolves to an object containing original and generated words.
+ */
+async function generateRelatedWords(preferences) {
+    try {
+        const prompt = PromptTemplate.fromTemplate("I want to ranking the content by the user preferences, so I will give you 3 preferences words from a user and you generate 10 related words for each preference word and i will count them in the content to do the count.Return the generated 30 words without the original words without any other text and comma-separated. Here are the preferences words:{preference}");
+        const formattedPrompt = await prompt.format({preference: preferences});
+        const llmResult = await llm.predict(formattedPrompt);
+        const generatedWords = llmResult.split(',').map(word => word.trim());
+        console.log("generatedWords", generatedWords);
+        return {
+            originalWords: preferences.split(',').map(word => word.trim()),
+            generatedWords: generatedWords,
+        };
+    } catch (error) {
+        console.error('Error generating related words:', error);
+        throw error;
+    }
+}
+/**
+ * Counts the number of occurrences of a specific word in a given content.
+ * @param {string} word - The word to count occurrences of.
+ * @param {string} content - The content to search within.
+ * @returns {number} The number of times the word occurs in the content.
+ */
 function countWordOccurrences(word, content) {
     const words = content.split(/[\s,]+/);
     let count = 0;
@@ -38,47 +90,96 @@ function countWordOccurrences(word, content) {
     });
     return count;
 }
-// Function to score a content based on word occurrences/length
+/**
+ * Scores a piece of content based on the frequency of generated words.
+ * The score is the sum of occurrences of each generated word divided by the total word count.
+ * @param {Array} generatedWords - An array of words generated based on user preferences.
+ * @param {string} content - The content to be scored.
+ * @returns {number} The score of the content based on word frequency.
+ */
 function scoreContent(generatedWords, content) {
     let score = 0;
     generatedWords.forEach(word => {
         score += countWordOccurrences(word, content);
     });
-    const firstThreeWords = content.split(' ').slice(0, 3).join(' ');
+    //const firstFiveWords = content.split(' ').slice(0, 3).join(' ');
 
 
     const wordCount = content.split(' ').length; // Count of words
-    console.log(firstThreeWords, ':', score / wordCount);
+    //console.log(firstThreeWords, ':', score / wordCount);
     return score / wordCount;
 }
 
-// Function to rank contents based on their scores
-function rankContents(contents, generatedWords) {
-    const scoredContents = contents.map(content => ({
-        content: content,
-        score: scoreContent(generatedWords, content)
+/**
+ * Ranks a list of contents based on a combined score.
+ * @param {Array} contents - An array of contents to be ranked.
+ * @param {Array} preferencesVector - The vector representing user preferences.
+ * @param {Array} generatedWords - Words generated related to user preferences.
+ * @returns {Promise<Array>} A promise that resolves to an array of contents sorted by their combined score.
+ */
+async function rankContents(contents, preferencesVector, generatedWords) {
+    const scoredContents = await Promise.all(contents.map(async content => {
+        const combinedScoreValue = await combinedScore(content, preferencesVector, generatedWords);
+        const firstFiveWords = content.split(' ').slice(0, 5).join(' ');
+        console.log('Content:::',firstFiveWords, ':::', 'combinedScore:', combinedScoreValue)
+        return {
+            content: content,
+            combinedScore: combinedScoreValue
+        };
     }));
 
-    // Sort contents by score in descending order
-    return scoredContents.sort((a, b) => b.score - a.score);
+    return scoredContents.sort((a, b) => b.combinedScore - a.combinedScore);
 }
-
-// Main function to get the highest-scoring content
+/**
+ * Retrieves the top content based on user preferences.
+ * @param {string} preferences - User preferences as a comma-separated string.
+ * @param {Array} contents - An array of contents to be evaluated.
+ * @returns {Promise<string>} A promise that resolves to the content with the highest score.
+ */
 async function getTopContent(preferences, contents) {
-    console.log("generating related words based on client preferences...")
+    console.log("Generating related words based on client preferences...");
     const { generatedWords } = await generateRelatedWords(preferences);
-    console.log("Done")
-    console.log("ranking the contents...")
-    const ranked = rankContents(contents, generatedWords);
-    console.log("Done")
+    console.log("Vectorizing preferences...");
+    const preferencesVector = await getEmbeddings(preferences);
+    console.log("Done");
+    console.log("Ranking the contents...");
+    const ranked = await rankContents(contents, preferencesVector, generatedWords);
+    console.log("Done");
     return ranked[0].content;  // Return the highest-scoring content
 }
 
-
 module.exports = {
+    getEmbeddings,
+    cosineSimilarity,
+    combinedScore,
     generateRelatedWords,
     countWordOccurrences,
     scoreContent,
     rankContents,
     getTopContent
 };
+// async function runTests() {
+//     console.log("Testing Embedding Query...");
+//     const sampleText = "Hello, world!";
+//     const embeddingsResult = await getEmbeddings(sampleText);
+//     console.log("Embeddings Result:", embeddingsResult);
+
+//     console.log("Testing Cosine Similarity...");
+//     const vecA = [1, 2, 3];
+//     const vecB = [4, 5, 6];
+//     const similarity = cosineSimilarity(vecA, vecB);
+//     console.log("Cosine Similarity:", similarity);
+
+//     console.log("Testing Content Scoring and Ranking...");
+//     const contents = ["This is a test content.", "Another test content here.", "Yet another piece of content."];
+//     const preferences = "test, content";
+//     const { generatedWords } = await generateRelatedWords(preferences);
+//     const rankedContents = await rankContents(contents, await getEmbeddings(preferences), generatedWords);
+//     console.log("Ranked Contents:", rankedContents);
+
+//     console.log("Testing Top Content Retrieval...");
+//     const topContent = await getTopContent(preferences, contents);
+//     console.log("Top Content:", topContent);
+// }
+
+// runTests().then(() => console.log("Tests completed.")).catch(err => console.error("Tests failed:", err));
diff --git a/package.json b/package.json
@@ -15,15 +15,18 @@
   },
   "dependencies": {
     "@huggingface/hub": "^0.10.0",
+    "axios": "^1.6.2",
     "body-parser": "^1.20.2",
     "cors": "^2.8.5",
     "express": "^4.18.2",
     "faiss-node": "^0.3.0",
     "langchain": "^0.0.167",
+    "mathjs": "^12.1.0",
     "node-fetch": "^2.7.0",
     "openai": "^4.12.1",
     "readline-sync": "^1.4.10",
     "sqlite3": "^5.1.6",
+    "word2vec": "^1.1.5",
     "xmldom": "^0.6.0",
     "yargs": "^17.7.2"
   },