From 6810c953b19cebff681181c491c3872cd8132ec6 Mon Sep 17 00:00:00 2001 From: Dark_Nemesis Date: Sun, 16 Feb 2025 19:01:16 +0530 Subject: [PATCH] Fix: Handle Missing Page Titles This commit handles pages whose title is missing or contains only whitespace by: (1) using .get('') to safely retrieve the page title, which yields an empty string by default when the element is missing; (2) using .strip() to remove leading/trailing whitespace from the extracted title; and (3) using page_title or None to explicitly set the title to None if it is missing or empty after stripping whitespace. It also strips whitespace from the extracted paragraph text and h2 headers and drops entries that are empty after stripping. --- rocket_chat_docs_spider/rcspider.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/rocket_chat_docs_spider/rcspider.py b/rocket_chat_docs_spider/rcspider.py index a437040..59ae432 100644 --- a/rocket_chat_docs_spider/rcspider.py +++ b/rocket_chat_docs_spider/rcspider.py @@ -49,12 +49,12 @@ def parse(self, response): self.visited_urls.add(link) yield scrapy.Request(response.urljoin(link), callback=self.parse) - content = response.css('.content_block div p::text').getall() - h2_headers = response.css('.content_block div h2::text').getall() - page_title = response.css('.content_block div h1::text').get() + content = [c.strip() for c in response.css('.content_block div p::text').getall() if c.strip()] + h2_headers = [h.strip() for h in response.css('.content_block div h2::text').getall() if h.strip()] + page_title = response.css('.content_block div h1::text').get('').strip() yield { - "page_title": page_title, + "page_title": page_title or None, "content": content, "h2_headers": h2_headers, "url": response.url