Source code for py_alpaca_api.trading.news

import json
import logging
import math
import textwrap
import time

import pendulum
import yfinance as yf
from bs4 import BeautifulSoup

from py_alpaca_api.http.requests import Requests

# Logger for this module, named after the module's import path.
logger = logging.getLogger(__name__)

# Disable yfinance logging: the library's logger is switched off and stopped
# from propagating records to ancestor loggers.
yfinance_logger = logging.getLogger("yfinance")
yfinance_logger.disabled = True
yfinance_logger.propagate = False

# Date window as "YYYY-MM-DD" strings: fourteen days ago up to today.
# NOTE: both values are computed once, when this module is imported, and are
# not refreshed afterwards.
START_DATE = pendulum.now().subtract(days=14).to_date_string()
END_DATE = pendulum.now().to_date_string()
[docs] class News: def __init__(self, headers: dict[str, str]) -> None: self.news_url = "https://data.alpaca.markets/v1beta1/news" self.headers = headers @staticmethod
[docs] def strip_html(content: str): """Removes HTML tags and returns the stripped content. Args: content (str): The HTML content to be stripped. Returns: str: The stripped content without HTML tags. """ soup = BeautifulSoup(content, "html.parser") for data in soup(["style", "script"]): data.decompose() return " ".join(soup.stripped_strings)
@staticmethod def _parse_date_safe(date_str: str) -> str: """Safely parse a date string with pendulum.""" try: parsed = pendulum.parse(date_str) if isinstance(parsed, pendulum.DateTime): return parsed.to_datetime_string() # If not a DateTime, convert to string and return return str(parsed) except Exception: return date_str @staticmethod
[docs] def scrape_article(url: str) -> str | None: """Scrapes the article text from the given URL. Args: url (str): The URL of the article. Returns: str | None: The text content of the article, or None if the article body is not found. """ time.sleep(1) # Sleep for 1 second to avoid rate limiting headers = { "accept": "*/*", "accept-encoding": "gzip, deflate, br", "accept-language": "en-US,en;q=0.9", "referer": "https://www.google.com", "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, \ like Gecko) Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.44", } request = Requests().request(method="GET", url=url, headers=headers) soup = BeautifulSoup(request.text, "html.parser") caas_body = soup.find(class_="caas-body") return caas_body.text if caas_body is not None else None
######################################################## # //////////// static _truncate method //////////////# ######################################################## @staticmethod
[docs] def truncate(text: str, length: int) -> str: """Truncates a given text to a specified length. Args: text (str): The text to be truncated. length (int): The maximum length of the truncated text. Returns: str: The truncated text. """ return ( textwrap.shorten(text, length, placeholder="") if len(text) > length else text )
[docs] def get_news(self, symbol: str, limit: int = 6) -> list[dict[str, str]]: """Retrieves news articles related to a given symbol from Benzinga and Yahoo Finance. Note: Yahoo Finance has implemented anti-scraping measures that prevent fetching full article content. Yahoo news will include title, URL, publish date, and summary/description when available, but not full article text. Args: symbol (str): The symbol for which to retrieve news articles. limit (int, optional): The maximum number of news articles to retrieve. Defaults to 6. Returns: list: A list of news articles, sorted by publish date in descending order. """ benzinga_news = self._get_benzinga_news(symbol=symbol, limit=limit) yahoo_news = self._get_yahoo_news( symbol=symbol, limit=(limit - len(benzinga_news[: (math.floor(limit / 2))])), scrape_content=False, ) news = benzinga_news[: (math.floor(limit / 2))] + yahoo_news sorted_news = sorted( news, key=lambda x: pendulum.parse(x["publish_date"]), reverse=True ) return sorted_news[:limit]
def _get_yahoo_news( self, symbol: str, limit: int = 6, scrape_content: bool = False ) -> list[dict[str, str]]: """Retrieves the latest news articles related to a given symbol from Yahoo Finance. Args: symbol (str): The symbol for which to retrieve news articles. limit (int, optional): The maximum number of news articles to retrieve. Defaults to 6. scrape_content (bool, optional): Whether to attempt scraping full article content. Defaults to False due to Yahoo's anti-scraping measures. Returns: list: A list of dictionaries containing the news article details, including title, URL, source, content (if available), publish date, and symbol. """ ticker = yf.Ticker(symbol) news_response = ticker.news yahoo_news = [] news_count = 0 for news in news_response[:limit]: # Limit the iteration try: news_content = news.get("content", {}) # Extract the summary/description if available content = None if scrape_content: # Only attempt scraping if explicitly requested try: scraped_article = self.scrape_article( news_content.get("canonicalUrl", {}).get("url", "") ) if scraped_article: content = self.truncate( self.strip_html(scraped_article), 8000 ) except Exception as scrape_error: logger.debug( f"Could not scrape article content: {scrape_error}" ) # Use the summary from the API if scraping failed or wasn't attempted if not content: # Try to get summary from the news data itself summary = news_content.get("summary", "") if not summary: # Some news items have description instead of summary summary = news.get("summary", "") content = self.truncate(summary, 8000) if summary else None yahoo_news.append( { "title": news_content.get( "title", news.get("title", "No title") ), "url": news_content.get("canonicalUrl", {}).get( "url", news.get("link", "") ), "source": "yahoo", "content": content, "publish_date": pendulum.from_timestamp( news_content.get( "pubDate", news.get("providerPublishTime", 0) ) ).to_datetime_string() if news_content.get("pubDate") or news.get("providerPublishTime") 
else pendulum.now().to_datetime_string(), "symbol": symbol, } ) news_count += 1 except Exception: logger.exception("Error processing Yahoo news item") continue if news_count >= limit: break return yahoo_news def _get_benzinga_news( self, symbol: str, start_date: str = START_DATE, end_date: str = END_DATE, include_content: bool = True, exclude_contentless: bool = True, limit: int = 10, ) -> list[dict[str, str]]: """Retrieves Benzinga news articles for a given symbol and date range. Args: symbol (str): The symbol for which to retrieve news articles. start_date (str, optional): The start date of the news articles. Defaults to START_DATE. end_date (str, optional): The end date of the news articles. Defaults to END_DATE. include_content (bool, optional): Whether to include the content of the news articles. Defaults to True. exclude_contentless (bool, optional): Whether to exclude news articles with no content. Defaults to True. limit (int, optional): The maximum number of news articles to retrieve. Defaults to 10. Returns: list: A list of dictionaries representing the retrieved news articles. Each dictionary contains the following keys: - "title": The title of the news article. - "url": The URL of the news article. - "source": The source of the news article (in this case, "benzinga"). - "content": The content of the news article, or None if there is no content. - "publish_date": The publishing date of the news article. - "symbol": The symbol associated with the news article. 
""" url = f"{self.news_url}" params: dict[str, str | bool | float | int] = { "symbols": symbol, "start": start_date, "end": end_date, "include_content": include_content, "exclude_contentless": exclude_contentless, "limit": limit, } response = json.loads( Requests() .request(method="GET", url=url, headers=self.headers, params=params) .text ) benzinga_news = [] for news in response["news"]: benzinga_news.append( { "title": news["headline"], "url": news["url"], "source": "benzinga", "content": self.strip_html(news["content"]) if news["content"] else None, "publish_date": self._parse_date_safe(news["created_at"]), "symbol": symbol, } ) return benzinga_news