From dadac60128d162372388b1c4e1b5519ffdab47e9 Mon Sep 17 00:00:00 2001 From: Marcin Kwiatkowski Date: Fri, 7 Nov 2025 09:19:18 +0100 Subject: [PATCH] fix: fixed feed generation --- Utils/RSSfeed/feed.xml.js | 102 +++++++++++++++++++++++--------------- 1 file changed, 62 insertions(+), 40 deletions(-) diff --git a/Utils/RSSfeed/feed.xml.js b/Utils/RSSfeed/feed.xml.js index 49df43d..58fd714 100644 --- a/Utils/RSSfeed/feed.xml.js +++ b/Utils/RSSfeed/feed.xml.js @@ -67,7 +67,7 @@ const baseConfig = { language: "en", ttl: 60, }, - contentDir: path.join(__dirname, '..', '..'), + contentDir: path.join(__dirname, "..", ".."), excludeDirs: [ "node_modules", ".git", @@ -344,6 +344,49 @@ function generateRSSFeed(config) { console.log(`✓ Feed saved to ${path.basename(config.outputPath)}\n`); } +/** + * Check if a URL looks valid (not containing array notation or other invalid patterns) + * @param {string} url - URL to validate + * @returns {boolean} True if URL looks valid + */ +function isValidUrl(url) { + // Extract the path part after the domain + const pathMatch = url.match(/https:\/\/frodigo\.com\/(.+)$/); + if (!pathMatch) { + return false; + } + const path = pathMatch[1]; + + // Filter out URLs that are just numbers (likely extracted from array notation like [10, 20, 30]) + if (/^\d+$/.test(path)) { + return false; + } + + // Filter out URLs that contain array notation patterns + if (/\[.*?\]/.test(url)) { + return false; + } + // Filter out URLs that start with numbers followed by commas (array elements) + if (/\/\d+,\+/.test(url)) { + return false; + } + // Filter out URLs ending with array-like patterns + if (/,\+\d+[,\]]/.test(url)) { + return false; + } + // Filter out other invalid patterns + if ( + url.includes("...") || + url.includes("undefined") || + url.includes("link") || + url.includes("Wiki") || + url.includes("&quot;") + ) { + return false; + } + return true; +} + /** * Extract links from RSS feed XML 
content @@ -351,11 +394,19 @@ function generateRSSFeed(config) { */ function extractLinksFromFeed(feedContent) { const allLinks = new Set(); - const urlRegex = /https:\/\/frodigo\.com\/[^"\s<>]+/g; + // More restrictive regex: stop at brackets, parentheses, and other invalid URL characters + // Also stop at common punctuation that shouldn't be in URLs + const urlRegex = /https:\/\/frodigo\.com\/[^"\s<>\[\](){}|\\^`]+/g; // First, remove code blocks to avoid processing links within them - const contentWithoutCodeBlocks = feedContent.replace( - /<pre><code>[\s\S]*?<\/code><\/pre>/g,
+  // Also remove inline code with backticks
+  let contentWithoutCodeBlocks = feedContent.replace(
+    /<pre><code>[\s\S]*?<\/code><\/pre>/gi,
+    ""
+  );
+  // Remove inline code elements
+  contentWithoutCodeBlocks = contentWithoutCodeBlocks.replace(
+    /<code>[\s\S]*?<\/code>/gi,
     ""
   );
 
@@ -366,14 +417,8 @@ function extractLinksFromFeed(feedContent) {
   contentMatches.forEach((match) => {
     const contentUrls = match[1].match(urlRegex) || [];
     contentUrls.forEach((url) => {
-      // Filter out invalid links and links that look like code snippets
-      if (
-        !url.includes("...") &&
-        !url.includes("undefined") &&
-        !url.includes("link") &&
-        !url.includes("Wiki") &&
-        !url.includes("&quot;")
-      ) {
+      // Filter out invalid links using the validation function
+      if (isValidUrl(url)) {
         allLinks.add(url);
       }
     });
@@ -384,13 +429,7 @@ function extractLinksFromFeed(feedContent) {
   const linkMatches = [...contentWithoutCodeBlocks.matchAll(linkRegex)];
   linkMatches.forEach((match) => {
     const url = match[1];
-    if (
-      !url.includes("...") &&
-      !url.includes("undefined") &&
-      !url.includes("link") &&
-      !url.includes("Wiki") &&
-      !url.includes("&quot;")
-    ) {
+    if (isValidUrl(url)) {
       allLinks.add(url);
     }
   });
@@ -400,13 +439,7 @@ function extractLinksFromFeed(feedContent) {
   const guidMatches = [...contentWithoutCodeBlocks.matchAll(guidRegex)];
   guidMatches.forEach((match) => {
     const url = match[1];
-    if (
-      !url.includes("...") &&
-      !url.includes("undefined") &&
-      !url.includes("link") &&
-      !url.includes("Wiki") &&
-      !url.includes("&quot;")
-    ) {
+    if (isValidUrl(url)) {
       allLinks.add(url);
     }
   });
@@ -417,13 +450,7 @@ function extractLinksFromFeed(feedContent) {
   descMatches.forEach((match) => {
     const descUrls = match[1].match(urlRegex) || [];
     descUrls.forEach((url) => {
-      if (
-        !url.includes("...") &&
-        !url.includes("undefined") &&
-        !url.includes("link") &&
-        !url.includes("Wiki") &&
-        !url.includes("&quot;")
-      ) {
+      if (isValidUrl(url)) {
         allLinks.add(url);
       }
     });
@@ -435,13 +462,7 @@ function extractLinksFromFeed(feedContent) {
   const imageMatches = [...contentWithoutCodeBlocks.matchAll(imageRegex)];
   imageMatches.forEach((match) => {
     const url = match[1];
-    if (
-      !url.includes("...") &&
-      !url.includes("undefined") &&
-      !url.includes("link") &&
-      !url.includes("Wiki") &&
-      !url.includes("&quot;")
-    ) {
+    if (isValidUrl(url)) {
       allLinks.add(url);
     }
   });
@@ -510,4 +531,5 @@ module.exports = {
   createFeedItem,
   generateRSSFeed,
   extractLinksFromFeed,
+  isValidUrl,
 };