Skip to content

Commit

Permalink
Merge pull request #53 from DevCaptainOne/queue
Browse files Browse the repository at this point in the history
- Remove duplicates by storing the URL of the RSS articles and prevent publishing the same article twice.
- Remove HTML tags and entities from the title.
  • Loading branch information
milanmdev authored Jan 11, 2024
2 parents af4374a + 2687511 commit a6aa284
Show file tree
Hide file tree
Showing 9 changed files with 160 additions and 54 deletions.
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -131,4 +131,7 @@ dist

data/last.txt
data/config.json
data/persist.json
data/persist.json

package-lock.json
docker-compose.yml
6 changes: 5 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,9 @@ Here's an example of the `config.json` file:
"forceDescriptionEmbed": false,
"descriptionClearHTML": false,
"ogUserAgent": "",
"imageAlt": "$title"
"imageAlt": "$title",
"removeDuplicate": false,
"titleClearHTML": false
}
```

Expand All @@ -99,6 +101,8 @@ Here's an example of the `config.json` file:
- `descriptionClearHTML`: Remove HTML from the Open Graph description and the RSS-provided description (to make them more readable).
- `ogUserAgent`: The user agent to use when fetching the Open Graph data of the URL provided by the RSS post. By default, this is set to `bsky.rss/1.0 (Open Graph Scraper)`.
- `imageAlt`: Alt text for the uploaded image if the `embedType` is set to `image`. Can be any variable (+ string) used in the `string` configuration (e.g. `$title`).
- `removeDuplicate`: Instead of using the last date to track which items need to be published, use a text-based database to track duplicate items.
- `titleClearHTML`: Remove HTML from the title of the post (to make it more readable).

A `docker-compose.yml` file can be found in the root directory as `docker-compose.example.yml`, which you can use to set up the RSS poster using Docker.

Expand Down
2 changes: 2 additions & 0 deletions app/types/index.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ interface Config {
descriptionClearHTML?: boolean;
forceDescriptionEmbed?: boolean;
imageAlt?: string;
removeDuplicate?: boolean;
titleClearHTML?: boolean;
}

interface Item {
Expand Down
56 changes: 56 additions & 0 deletions app/utils/dbHandler.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import fs from "fs";

let appConfig: any = null;

async function readLast() {
Expand Down Expand Up @@ -62,11 +63,66 @@ async function readConfig() {
return JSON.parse(appConfig);
}

/**
 * Reports whether `value` has already been recorded in the duplicate-tracking file.
 *
 * The file holds one `<ISO timestamp>|<value>` record per line (the format
 * appended by `writeValue`). A value counts as present only when it is
 * identical to the value part of a record; a plain substring search over the
 * whole file would also match longer URLs sharing the same prefix, or pieces
 * of a timestamp.
 *
 * If the file does not exist yet, it is created empty and `false` is returned.
 *
 * @param value - Value (for example an article URL) to look up.
 * @param dbPath - File to search. Defaults to `data/db.txt` resolved relative
 *   to this module's directory.
 * @returns `true` when an identical value is stored, otherwise `false`.
 */
async function valueExists(
  value: string,
  dbPath: string = __dirname + "/../../data/db.txt"
): Promise<boolean> {
  if (!fs.existsSync(dbPath)) {
    fs.writeFileSync(dbPath, "", "utf8");
    return false;
  }

  const fileContent = fs.readFileSync(dbPath, "utf8");
  return fileContent.split("\n").some((line) => {
    // Everything after the first "|" is the stored value; the ISO timestamp
    // in front of it never contains that character.
    const separatorIndex = line.indexOf("|");
    return separatorIndex !== -1 && line.slice(separatorIndex + 1) === value;
  });
}

/**
 * Appends `value` to the duplicate-tracking file as a new record.
 *
 * Each record is a single line of the form `<ISO timestamp>|<value>`, where the
 * timestamp is the moment of the call. `fs.appendFileSync` creates the file
 * when it does not exist yet.
 *
 * @param value - Value (for example an article URL) to record.
 * @param dbPath - File to append to. Defaults to `data/db.txt` resolved
 *   relative to this module's directory.
 * @returns The `value` that was passed in, unchanged.
 */
async function writeValue(
  value: string,
  dbPath: string = __dirname + "/../../data/db.txt"
): Promise<string> {
  const timestamp = new Date().toISOString();
  fs.appendFileSync(dbPath, timestamp + "|" + value + "\n", "utf8");
  return value;
}

/**
 * Removes records older than 96 hours from the duplicate-tracking file.
 *
 * Every line is expected to look like `<ISO timestamp>|<value>`. Lines whose
 * timestamp is at most 96 hours old are kept, each terminated by a newline.
 * Lines whose leading field is not a parseable date (including blank lines)
 * are dropped, because the resulting `NaN` age fails the comparison.
 *
 * When the file does not exist there is nothing to clean, so the function
 * returns without touching the disk instead of failing on the read.
 *
 * @param dbPath - File to clean. Defaults to `data/db.txt` resolved relative
 *   to this module's directory.
 * @returns Always `true`.
 */
async function cleanupOldValues(
  dbPath: string = __dirname + "/../../data/db.txt"
): Promise<boolean> {
  const maxAgeHours = 96;

  // The file is only created once a value has been looked up or written.
  if (!fs.existsSync(dbPath)) {
    return true;
  }

  const now = new Date();
  const keptLines = fs
    .readFileSync(dbPath, "utf8")
    .split("\n")
    .filter((line) => {
      const recordedAt = new Date(line.split("|")[0] ?? "");
      // An invalid date yields NaN, and `NaN <= maxAgeHours` is false.
      return getHoursDiffBetweenDates(recordedAt, now) <= maxAgeHours;
    });

  fs.writeFileSync(
    dbPath,
    keptLines.map((line) => line + "\n").join(""),
    "utf8"
  );
  return true;
}

/** Returns the (possibly fractional) number of hours from `dateInitial` to `dateFinal`. */
const getHoursDiffBetweenDates = (dateInitial: Date, dateFinal: Date) =>
  (dateFinal.getTime() - dateInitial.getTime()) / (1000 * 3600);

// Collects the handler functions of this module into the single object that
// is exposed as the module's default export.
const dbHandler = {
  readLast,
  writeDate,
  readConfig,
  initConfig,
  writePersistDate,
  readPersistData,
  valueExists,
  writeValue,
  cleanupOldValues,
};

export default dbHandler;
3 changes: 3 additions & 0 deletions app/utils/queueHandler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ let config: Config = {
descriptionClearHTML: true,
forceDescriptionEmbed: false,
imageAlt: "",
removeDuplicate: false,
titleClearHTML: false
};

async function start() {
Expand Down Expand Up @@ -92,6 +94,7 @@ async function runQueue() {
config.runInterval
} seconds`
);
db.cleanupOldValues();
}
}
}
Expand Down
38 changes: 34 additions & 4 deletions app/utils/rssHandler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ import axios from "axios";
import queue from "./queueHandler";
import db from "./dbHandler";
import og from "open-graph-scraper";
import {decode} from 'html-entities';

let reader: any = null;
let lastDate: string = "";

Expand All @@ -18,6 +20,8 @@ let config: Config = {
ogUserAgent: "bsky.rss/1.0 (Open Graph Scraper)",
descriptionClearHTML: true,
forceDescriptionEmbed: false,
removeDuplicate: false,
titleClearHTML: false
};

async function start() {
Expand All @@ -33,10 +37,9 @@ async function start() {
if (!useDate)
return console.log("No date provided by RSS reader for post.");

if (new Date(useDate) <= new Date(lastDate)) return;

let parsed = parseString(config.string, item, config.truncate == true);
let embed: Embed | undefined = undefined;
let title: string | undefined = undefined;

if (config.publishEmbed) {
if (!item.link)
Expand All @@ -47,6 +50,13 @@ async function start() {
if (typeof item.link === "object") url = item.link.href;
else url = item.link;

if (config.removeDuplicate){
if (await db.valueExists(url)) return;
else await db.writeValue(url);
} else {
if (new Date(useDate) <= new Date(lastDate)) return;
}

let image: Buffer | undefined = undefined;
let description: string | undefined = undefined;
let imageAlt: string | undefined = undefined;
Expand Down Expand Up @@ -186,9 +196,17 @@ async function start() {
}
}

if (new Date(useDate) <= new Date(lastDate)) return;

title = item.title;

if (title && config.titleClearHTML) {
title = decodeHTML(removeHTMLTags(title));
}

await queue.writeQueue({
content: parsed.text,
title: item.title,
title: title,
embed: config.publishEmbed ? embed : undefined,
languages: config.languages ? config.languages : undefined,
date: useDate,
Expand Down Expand Up @@ -235,7 +253,13 @@ function parseString(string: string, item: Item, truncate: boolean) {
let parsedString = string;
if (string.includes("$title")) {
if (!item.title) throw new Error("No title provided from RSS reader.");
parsedString = parsedString.replace("$title", item.title);

if (config.titleClearHTML) {
parsedString = parsedString.replace("$title", decodeHTML(removeHTMLTags(item.title)));
}
else {
parsedString = parsedString.replace("$title", item.title);
}
}

if (string.includes("$link")) {
Expand Down Expand Up @@ -294,3 +318,9 @@ function removeHTMLTags(htmlString: string) {
.trim()
.replace(/ +/g, " ");
}

/**
 * Decodes HTML entities in `htmlString` by running the decoder twice.
 *
 * According to the original author's tests, some HTML strings need to be
 * decoded twice, e.g. `&amp;#233;` -> `&#233;` -> `é`.
 */
function decodeHTML(htmlString: string) {
  const decodedOnce = decode(htmlString);
  return decode(decodedOnce);
}
4 changes: 3 additions & 1 deletion data/config.example.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,7 @@
"forceDescriptionEmbed": false,
"descriptionClearHTML": false,
"ogUserAgent": "",
"imageAlt": "$title"
"imageAlt": "$title",
"removeDuplicate": false,
"titleClearHTML": false
}
3 changes: 2 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@
"dotenv": "^16.3.1",
"feedsub": "^0.7.8",
"open-graph-scraper": "6.3.2",
"sharp": "^0.32.4"
"sharp": "^0.32.4",
"html-entities": "2.4.0"
},
"devDependencies": {
"@types/node": "20.9.0",
Expand Down
Loading

0 comments on commit a6aa284

Please sign in to comment.