Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding Opengraph.io API to scrape websites and read/infer open graph tags. #120

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion crewai_tools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,15 @@
FirecrawlCrawlWebsiteTool,
FirecrawlScrapeWebsiteTool,
FirecrawlSearchTool,
GetOpengraphTagsTool,
GithubSearchTool,
JSONSearchTool,
LlamaIndexTool,
MDXSearchTool,
MultiOnTool,
MySQLSearchTool,
NL2SQLTool,
OpenGraphScrapeWebsiteTool,
PDFSearchTool,
PGSearchTool,
RagTool,
Expand All @@ -40,6 +43,5 @@
XMLSearchTool,
YoutubeChannelSearchTool,
YoutubeVideoSearchTool,
MySQLSearchTool
)
from .tools.base_tool import BaseTool, Tool, tool
18 changes: 12 additions & 6 deletions crewai_tools/tools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,27 +11,34 @@
from .file_read_tool.file_read_tool import FileReadTool
from .file_writer_tool.file_writer_tool import FileWriterTool
from .firecrawl_crawl_website_tool.firecrawl_crawl_website_tool import (
FirecrawlCrawlWebsiteTool
FirecrawlCrawlWebsiteTool,
)
from .firecrawl_scrape_website_tool.firecrawl_scrape_website_tool import (
FirecrawlScrapeWebsiteTool
FirecrawlScrapeWebsiteTool,
)
from .firecrawl_search_tool.firecrawl_search_tool import FirecrawlSearchTool
from .github_search_tool.github_search_tool import GithubSearchTool
from .json_search_tool.json_search_tool import JSONSearchTool
from .llamaindex_tool.llamaindex_tool import LlamaIndexTool
from .mdx_seach_tool.mdx_search_tool import MDXSearchTool
from .multion_tool.multion_tool import MultiOnTool
from .mysql_search_tool.mysql_search_tool import MySQLSearchTool
from .nl2sql.nl2sql_tool import NL2SQLTool
from .opengraphio_get_opengraph_tags_tool.opengraphio_get_opengraph_tags_tool import (
GetOpengraphTagsTool,
)
from .opengraphio_scrape_website_tool.opengraphio_scrape_website_tool import (
OpenGraphScrapeWebsiteTool,
)
from .pdf_search_tool.pdf_search_tool import PDFSearchTool
from .pg_seach_tool.pg_search_tool import PGSearchTool
from .rag.rag_tool import RagTool
from .scrape_element_from_website.scrape_element_from_website import (
ScrapeElementFromWebsiteTool
ScrapeElementFromWebsiteTool,
)
from .scrape_website_tool.scrape_website_tool import ScrapeWebsiteTool
from .scrapfly_scrape_website_tool.scrapfly_scrape_website_tool import (
ScrapflyScrapeWebsiteTool
ScrapflyScrapeWebsiteTool,
)
from .selenium_scraping_tool.selenium_scraping_tool import SeleniumScrapingTool
from .serper_dev_tool.serper_dev_tool import SerperDevTool
Expand All @@ -46,7 +53,6 @@
from .website_search.website_search_tool import WebsiteSearchTool
from .xml_search_tool.xml_search_tool import XMLSearchTool
from .youtube_channel_search_tool.youtube_channel_search_tool import (
YoutubeChannelSearchTool
YoutubeChannelSearchTool,
)
from .youtube_video_search_tool.youtube_video_search_tool import YoutubeVideoSearchTool
from .mysql_search_tool.mysql_search_tool import MySQLSearchTool
120 changes: 120 additions & 0 deletions crewai_tools/tools/opengraphio_get_opengraph_tags_tool/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
# GetOpengraphTagsTool

## Description

The `GetOpengraphTagsTool` is a tool for retrieving OpenGraph tags from websites using the OpenGraph.io API.
It extracts key OpenGraph metadata, such as titles, descriptions, and images, from webpages, allowing users to gather
insights about any given URL. In addition to the tags found on the site, the OpenGraph.io API will infer values
that may be missing from the page.

## Installation

To use the `GetOpengraphTagsTool`, you need to install the `crewai[tools]` package:

```sh
pip install crewai[tools]
```

## Example

```python
# To run the example, you will need to make sure you have your API keys set.
# 1. create a free account on https://opengraph.io/
# 2. set the OPENGRAPHIO_API_KEY environment variable to your API key
# 3. run the example

from crewai_tools.tools.opengraphio_get_opengraph_tags_tool.opengraphio_get_opengraph_tags_tool import GetOpengraphTagsTool
from crewai import Agent, Task, Crew

# Create an instance of the OpenGraphTool
opengraph_tags_tool = GetOpengraphTagsTool()

# Create the agent with the OpenGraphTool
opengraph_specialist = Agent(
role="Open Graph Metadata Specialist",
goal="Suggest most relevant Open Graph metadata tags for a website",
backstory="A skilled SEO / SEM consultant with 20 years of experience.",
tools=[opengraph_tags_tool],
verbose=True,
cache=False
)

# Define the tasks for the agent
suggest_opengraph_tags = Task(
description="Review the OpenGraph metadata and the tags suggested from the Opengraph.io API for "
"https://www.wunderground.com/ and suggest the most relevant Open Graph metadata tags. "
"The Opengraph.io API will return the following important properties:"
"- hybridGraph - The tags that the Opengraph.io API suggests for the page"
"- openGraph - The tags that are currently on the page",
expected_output="Provide the tags that are currently on the page ('openGraph' property) and suggest HTML to be "
"inserted into the <HEAD> tag to provide more effective tags for sharing on social websites. "
"The response should look like this:"
"## Current Tags"
                    "Your assessment of the current tags"
"## Suggested Tags"
                    "Your suggested HTML content to add to the <HEAD> tag"
"### Explanation"
"Explain why you suggest these tags",
agent=opengraph_specialist
)


# Create a crew with the agent and tasks
crew = Crew(
agents=[opengraph_specialist],
tasks=[
suggest_opengraph_tags
],
verbose=True
)

# Kick off the crew to execute the tasks
crew.kickoff()

```
### Output
```bash
# Agent: Open Graph Metadata Specialist
## Final Answer:
## Current Tags
The current Open Graph tags were not found on the page; however, there are inferred tags based on the content extracted:
- Title: Local Weather Forecast, News and Conditions | Weather Underground
- Description: Weather Underground provides local & long-range weather forecasts, weather reports, maps & tropical weather conditions for locations worldwide
- Type: site
- URL: https://www.wunderground.com/
- Site Name: Local Weather Forecast, News and Conditions
- Image: https://www.wunderground.com/static/i/misc/twc-white.svg

## Suggested Tags
To enhance social sharing, I suggest adding the following HTML content to the `<HEAD>` tag:
\`\`\`html
<meta property="og:title" content="Local Weather Forecast, News and Conditions | Weather Underground" />
<meta property="og:description" content="Weather Underground provides local & long-range weather forecasts, weather reports, maps & tropical weather conditions for locations worldwide" />
<meta property="og:type" content="website" />
<meta property="og:url" content="https://www.wunderground.com/" />
<meta property="og:site_name" content="Weather Underground" />
<meta property="og:image" content="https://www.wunderground.com/static/i/misc/twc-white.svg" />
<meta property="og:image:alt" content="Weather Underground Logo" />
\`\`\`

### Explanation
I suggest these tags because they provide essential metadata for social platforms to display rich previews when the link to Weather Underground is shared. Including a specific image (`og:image`) enhances the visual appeal, while a clear and concise title and description (`og:title` and `og:description`) can help engage users and improve click-through rates. These elements ensure that the page is represented accurately and attractively on social media, which is crucial for driving traffic and improving user engagement.

```

## Arguments
- `url` (string): The webpage URL to scrape.
- `full_render` (bool, optional): Whether to fully render the page before extracting metadata.
- `max_cache_age` (int, optional): The maximum cache age in milliseconds.
- `use_proxy` (bool, optional): Whether to use a proxy for scraping.
- `use_premium` (bool, optional): Whether to use the Premium Proxy feature.
- `use_superior` (bool, optional): Whether to use the Superior Proxy feature.
- `auto_proxy` (bool, optional): Whether to automatically use a proxy for domains that require one.
- `cache_ok` (bool, optional): Whether to allow cached responses.
- `accept_lang` (string, optional): The request language sent when requesting the URL.
- `ignore_scrape_failures` (bool, optional): Whether to ignore failures.

## API Key
To use the OpenGraph.io API, you need to create a free account on [https://opengraph.io](https://opengraph.io) and set
the OPENGRAPHIO_API_KEY environment variable to your API key.

44 changes: 44 additions & 0 deletions crewai_tools/tools/opengraphio_get_opengraph_tags_tool/example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
"""Example: use GetOpengraphTagsTool with a CrewAI agent to audit a site's
OpenGraph tags and suggest better ones.

Prerequisites:
1. Create a free account on https://opengraph.io/
2. Set the OPENGRAPHIO_API_KEY environment variable to your API key.
3. Run this script (LLM credentials for the agent are also required).
"""

from crewai import Agent, Crew, Task

from crewai_tools.tools.opengraphio_get_opengraph_tags_tool.opengraphio_get_opengraph_tags_tool import (
    GetOpengraphTagsTool,
)

# Create an instance of the OpenGraph tags tool.
# The API key is read from the OPENGRAPHIO_API_KEY environment variable.
opengraph_tags_tool = GetOpengraphTagsTool()

# Create the agent with the OpenGraph tags tool.
# Caching is disabled so every run fetches fresh data from the API.
opengraph_specialist = Agent(
    role="Open Graph Metadata Specialist",
    goal="Suggest most relevant Open Graph metadata tags for a website",
    backstory="A skilled SEO / SEM consultant with 20 years of experience.",
    tools=[opengraph_tags_tool],
    verbose=True,
    cache=False,
)

# Define the task for the agent. The adjacent string literals are
# concatenated implicitly into a single prompt.
# Fix vs. original: "You're" corrected to "Your" in the expected_output prompt.
suggest_opengraph_tags = Task(
    description="Review the OpenGraph metadata and the tags suggested from the Opengraph.io API for "
    "https://www.wunderground.com/ and suggest the most relevant Open Graph metadata tags. "
    "The Opengraph.io API will return the following important properties:"
    "- hybridGraph - The tags that the Opengraph.io API suggests for the page"
    "- openGraph - The tags that are currently on the page",
    expected_output="Provide the tags that are currently on the page ('openGraph' property) and suggest HTML to be "
    "inserted into the <HEAD> tag to provide more effective tags for sharing on social websites. "
    "The response should look like this:"
    "## Current Tags"
    "Your assessment of the current tags"
    "## Suggested Tags"
    "Your suggested HTML content to add to the <HEAD> tag"
    "### Explanation"
    "Explain why you suggest these tags",
    agent=opengraph_specialist,
)


# Create a crew with the agent and task.
crew = Crew(agents=[opengraph_specialist], tasks=[suggest_opengraph_tags], verbose=True)

# Kick off the crew to execute the tasks.
crew.kickoff()
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
import logging
import os
from typing import Optional, Type

from pydantic import BaseModel, Field

from crewai_tools.tools.base_tool import BaseTool

logger = logging.getLogger(__file__)


class GetOpengraphTagsToolSchema(BaseModel):
    """Input schema for GetOpengraphTagsTool.

    Mirrors the query options of the OpenGraph.io site endpoint. Only ``url``
    is required; every other field defaults to ``None`` and is omitted from
    the outgoing request when unset (see ``GetOpengraphTagsTool._run``).
    """

    url: str = Field(description="Webpage URL")
    cache_ok: Optional[bool] = Field(
        default=None, description="Whether to allow cached responses"
    )
    full_render: Optional[bool] = Field(
        default=None,
        description="Whether to fully render the page before extracting metadata",
    )
    use_proxy: Optional[bool] = Field(
        default=None, description="Whether to use a proxy for scraping"
    )
    use_premium: Optional[bool] = Field(
        default=None, description="Whether to use the Premium Proxy feature"
    )
    use_superior: Optional[bool] = Field(
        default=None, description="Whether to use the Superior Proxy feature"
    )
    auto_proxy: Optional[bool] = Field(
        default=None,
        description="Whether to automatically use a proxy for domains that require one",
    )
    max_cache_age: Optional[int] = Field(
        default=None, description="The maximum cache age in milliseconds"
    )
    accept_lang: Optional[str] = Field(
        default=None, description="The request language sent when requesting the URL"
    )
    # Client-side option: not forwarded to the API; controls whether _run
    # swallows request errors (returns None) instead of raising.
    ignore_scrape_failures: Optional[bool] = Field(
        default=None, description="Whether to ignore failures"
    )


class GetOpengraphTagsTool(BaseTool):
    """Fetch OpenGraph tags for a URL via the OpenGraph.io API.

    Calls the OpenGraph.io ``site`` endpoint, which returns both the tags
    present on the page and tags the service infers for it. The API key is
    taken from the constructor argument or the ``OPENGRAPHIO_API_KEY``
    environment variable.
    """

    name: str = "OpenGraph.io tags extraction tool"
    description: str = "Extract OpenGraph tags from a webpage URL using OpenGraph.io"
    args_schema: Type[BaseModel] = GetOpengraphTagsToolSchema
    # Fixed annotation: the default is None, so the field must be Optional[str]
    # (plain `str = None` is an invalid model field declaration).
    api_key: Optional[str] = None

    def __init__(self, api_key: Optional[str] = None):
        """Initialize the tool.

        Args:
            api_key: OpenGraph.io API key. Falls back to the
                OPENGRAPHIO_API_KEY environment variable when omitted.
        """
        super().__init__()
        self.api_key = api_key or os.getenv("OPENGRAPHIO_API_KEY")

    def _run(
        self,
        url: str,
        cache_ok: Optional[bool] = None,
        full_render: Optional[bool] = None,
        use_proxy: Optional[bool] = None,
        use_premium: Optional[bool] = None,
        use_superior: Optional[bool] = None,
        auto_proxy: Optional[bool] = None,
        max_cache_age: Optional[int] = None,
        accept_lang: Optional[str] = None,
        ignore_scrape_failures: Optional[bool] = None,
    ):
        """Request OpenGraph metadata for ``url`` from OpenGraph.io.

        Returns:
            The decoded JSON response (dict), or None when the request fails
            and ``ignore_scrape_failures`` is truthy.

        Raises:
            requests.RequestException: on request failure when
                ``ignore_scrape_failures`` is falsy.
        """
        # Lazy imports keep module import cheap (matches the tool's original style).
        import urllib.parse

        import requests

        # The target URL is a path segment of the API endpoint, so it must be
        # fully percent-encoded (including '/').
        encoded_url = urllib.parse.quote_plus(url)
        api_endpoint = f"https://opengraph.io/api/1.1/site/{encoded_url}"
        params = {"app_id": self.api_key}

        # Forward only the options the caller explicitly set; unset (None)
        # options are omitted so the API applies its own defaults.
        optional_params = {
            "cache_ok": cache_ok,
            "full_render": full_render,
            "use_proxy": use_proxy,
            "use_premium": use_premium,
            "use_superior": use_superior,
            "auto_proxy": auto_proxy,
            "max_cache_age": max_cache_age,
            "accept_lang": accept_lang,
        }
        params.update({k: v for k, v in optional_params.items() if v is not None})

        try:
            # Timeout prevents an agent run from hanging indefinitely on a
            # slow or unresponsive scrape (full_render jobs can be slow).
            response = requests.get(api_endpoint, params=params, timeout=60)
            response.raise_for_status()
            return response.json()
        except requests.RequestException as e:
            if ignore_scrape_failures:
                logger.error(
                    f"Error fetching OpenGraph tags from {url}, exception: {e}"
                )
                return None
            # Bare re-raise preserves the original traceback.
            raise
Loading