# Source code for py_alpaca_api.trading.news

import json
import logging
import math
import textwrap
import time

import pendulum
import yfinance as yf
from bs4 import BeautifulSoup

from py_alpaca_api.http.requests import Requests


# Module-level logger used by the news helpers in this file.
logger = logging.getLogger(__name__)

# Silence yfinance's own logger: disable it and stop its records from
# propagating to ancestor loggers.
yfinance_logger = logging.getLogger("yfinance")
yfinance_logger.disabled = True
yfinance_logger.propagate = False

# Date window covering the 14 days up to today, as date strings.
# NOTE: both values are computed once, when this module is imported, so they
# do not advance in a long-running process.
START_DATE = pendulum.now().subtract(days=14).to_date_string()
END_DATE = pendulum.now().to_date_string()




[docs]
class News:
    def __init__(self, headers: dict[str, str]) -> None:
        self.news_url = "https://data.alpaca.markets/v1beta1/news"
        self.headers = headers

    @staticmethod

[docs]
    def strip_html(content: str):
        """Removes HTML tags and returns the stripped content.

        Args:
            content (str): The HTML content to be stripped.

        Returns:
            str: The stripped content without HTML tags.
        """
        soup = BeautifulSoup(content, "html.parser")
        for data in soup(["style", "script"]):
            data.decompose()
        return " ".join(soup.stripped_strings)


    @staticmethod
    def _parse_date_safe(date_str: str) -> str:
        """Safely parse a date string with pendulum."""
        try:
            parsed = pendulum.parse(date_str)
            if isinstance(parsed, pendulum.DateTime):
                return parsed.to_datetime_string()
            # If not a DateTime, convert to string and return
            return str(parsed)
        except Exception:
            return date_str

    @staticmethod

[docs]
    def scrape_article(url: str) -> str | None:
        """Scrapes the article text from the given URL.

        Args:
            url (str): The URL of the article.

        Returns:
            str | None: The text content of the article, or None if the article body is not found.
        """
        time.sleep(1)  # Sleep for 1 second to avoid rate limiting
        headers = {
            "accept": "*/*",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "en-US,en;q=0.9",
            "referer": "https://www.google.com",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, \
                like Gecko) Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.44",
        }
        request = Requests().request(method="GET", url=url, headers=headers)
        soup = BeautifulSoup(request.text, "html.parser")
        caas_body = soup.find(class_="caas-body")
        return caas_body.text if caas_body is not None else None


    ########################################################
    # ////////////  static _truncate method  //////////////#
    ########################################################
    @staticmethod

[docs]
    def truncate(text: str, length: int) -> str:
        """Truncates a given text to a specified length.

        Args:
            text (str): The text to be truncated.
            length (int): The maximum length of the truncated text.

        Returns:
            str: The truncated text.
        """
        return (
            textwrap.shorten(text, length, placeholder="")
            if len(text) > length
            else text
        )



[docs]
    def get_news(self, symbol: str, limit: int = 6) -> list[dict[str, str]]:
        """Retrieves news articles related to a given symbol from Benzinga and Yahoo Finance.

        Note: Yahoo Finance has implemented anti-scraping measures that prevent fetching
        full article content. Yahoo news will include title, URL, publish date, and
        summary/description when available, but not full article text.

        Args:
            symbol (str): The symbol for which to retrieve news articles.
            limit (int, optional): The maximum number of news articles to retrieve. Defaults to 6.

        Returns:
            list: A list of news articles, sorted by publish date in descending order.
        """
        benzinga_news = self._get_benzinga_news(symbol=symbol, limit=limit)
        yahoo_news = self._get_yahoo_news(
            symbol=symbol,
            limit=(limit - len(benzinga_news[: (math.floor(limit / 2))])),
            scrape_content=False,
        )

        news = benzinga_news[: (math.floor(limit / 2))] + yahoo_news

        sorted_news = sorted(
            news, key=lambda x: pendulum.parse(x["publish_date"]), reverse=True
        )

        return sorted_news[:limit]


    def _get_yahoo_news(
        self, symbol: str, limit: int = 6, scrape_content: bool = False
    ) -> list[dict[str, str]]:
        """Retrieves the latest news articles related to a given symbol from Yahoo Finance.

        Args:
            symbol (str): The symbol for which to retrieve news articles.
            limit (int, optional): The maximum number of news articles to retrieve. Defaults to 6.
            scrape_content (bool, optional): Whether to attempt scraping full article content.
                                            Defaults to False due to Yahoo's anti-scraping measures.

        Returns:
            list: A list of dictionaries containing the news article details, including title, URL, source,
                  content (if available), publish date, and symbol.
        """
        ticker = yf.Ticker(symbol)
        news_response = ticker.news

        yahoo_news = []
        news_count = 0
        for news in news_response[:limit]:  # Limit the iteration
            try:
                news_content = news.get("content", {})

                # Extract the summary/description if available
                content = None
                if scrape_content:
                    # Only attempt scraping if explicitly requested
                    try:
                        scraped_article = self.scrape_article(
                            news_content.get("canonicalUrl", {}).get("url", "")
                        )
                        if scraped_article:
                            content = self.truncate(
                                self.strip_html(scraped_article), 8000
                            )
                    except Exception as scrape_error:
                        logger.debug(
                            f"Could not scrape article content: {scrape_error}"
                        )

                # Use the summary from the API if scraping failed or wasn't attempted
                if not content:
                    # Try to get summary from the news data itself
                    summary = news_content.get("summary", "")
                    if not summary:
                        # Some news items have description instead of summary
                        summary = news.get("summary", "")
                    content = self.truncate(summary, 8000) if summary else None

                yahoo_news.append(
                    {
                        "title": news_content.get(
                            "title", news.get("title", "No title")
                        ),
                        "url": news_content.get("canonicalUrl", {}).get(
                            "url", news.get("link", "")
                        ),
                        "source": "yahoo",
                        "content": content,
                        "publish_date": pendulum.from_timestamp(
                            news_content.get(
                                "pubDate", news.get("providerPublishTime", 0)
                            )
                        ).to_datetime_string()
                        if news_content.get("pubDate")
                        or news.get("providerPublishTime")
                        else pendulum.now().to_datetime_string(),
                        "symbol": symbol,
                    }
                )
                news_count += 1

            except Exception:
                logger.exception("Error processing Yahoo news item")
                continue

            if news_count >= limit:
                break

        return yahoo_news

    def _get_benzinga_news(
        self,
        symbol: str,
        start_date: str = START_DATE,
        end_date: str = END_DATE,
        include_content: bool = True,
        exclude_contentless: bool = True,
        limit: int = 10,
    ) -> list[dict[str, str]]:
        """Retrieves Benzinga news articles for a given symbol and date range.

        Args:
            symbol (str): The symbol for which to retrieve news articles.
            start_date (str, optional): The start date of the news articles. Defaults to START_DATE.
            end_date (str, optional): The end date of the news articles. Defaults to END_DATE.
            include_content (bool, optional): Whether to include the content of the news articles. Defaults to True.
            exclude_contentless (bool, optional): Whether to exclude news articles with no content. Defaults to True.
            limit (int, optional): The maximum number of news articles to retrieve. Defaults to 10.

        Returns:
            list: A list of dictionaries representing the retrieved news articles. Each dictionary contains the following keys:
                - "title": The title of the news article.
                - "url": The URL of the news article.
                - "source": The source of the news article (in this case, "benzinga").
                - "content": The content of the news article, or None if there is no content.
                - "publish_date": The publishing date of the news article.
                - "symbol": The symbol associated with the news article.
        """
        url = f"{self.news_url}"
        params: dict[str, str | bool | float | int] = {
            "symbols": symbol,
            "start": start_date,
            "end": end_date,
            "include_content": include_content,
            "exclude_contentless": exclude_contentless,
            "limit": limit,
        }
        response = json.loads(
            Requests()
            .request(method="GET", url=url, headers=self.headers, params=params)
            .text
        )

        benzinga_news = []
        for news in response["news"]:
            benzinga_news.append(
                {
                    "title": news["headline"],
                    "url": news["url"],
                    "source": "benzinga",
                    "content": self.strip_html(news["content"])
                    if news["content"]
                    else None,
                    "publish_date": self._parse_date_safe(news["created_at"]),
                    "symbol": symbol,
                }
            )

        return benzinga_news