1
0
Fork 0
mirror of https://codeberg.org/Ewen/rudibridge.git synced 2025-04-17 13:05:54 +00:00
rudibridge-mirror/api/api/scraper.py
Ewen 94547f2031 Leap forward
- added tests (pytest)
- use GET parameters to parse a page
- return an actual RSS feed
2024-05-08 10:13:54 +02:00

114 lines
3.5 KiB
Python

from datetime import datetime
import botasaurus as bt
import dateparser
class FeedItem:
    """A single entry of a scraped feed (title, body, link, timestamp, author)."""

    def __init__(self, title, content, link, item_datetime=None, author=None):
        self.title = title
        self.content = content
        self.link = link
        self.author = author
        # Keep the timestamp as an ISO-8601 string, or None when unknown.
        self.item_datetime = item_datetime.isoformat() if item_datetime else None

    def __lt__(self, other):
        """Order by timestamp when both sides have one, otherwise by title."""
        if self.item_datetime and other.item_datetime:
            # ISO-8601 strings compare in chronological order lexicographically.
            return self.item_datetime < other.item_datetime
        # An item missing a title always compares as "smaller".
        if not self.title or not other.title:
            return True
        return self.title < other.title
class Feed:
    """A scraped feed: the page title, its source URL, and its FeedItem list."""

    def __init__(self, title, url, items):
        self.title = title
        self.url = url
        self.items = items
def _class_filter(args, key):
    """Return a BeautifulSoup attrs filter {'class': value} for GET param *key*,
    or None when the parameter is absent/empty (matches any class)."""
    value = args.get(key)
    return {"class": value} if value else None


@bt.request(output=None)
def scrape(request, args):
    """Scrape the page at args['url'] into a Feed of FeedItems.

    *args* is a mapping of GET parameters: CSS tag names ('section',
    'article', 'title', 'content', 'link', 'datetime', 'author') plus an
    optional class filter for each ('sectionClass', 'articleClass', ...).
    'content' defaults to 'p' and 'link' to 'a'; the others are only used
    when provided.
    """
    soup = request.bs4(args.get("url"), timeout=5)

    # If a section tag is provided, restrict the search to it,
    # else scan the entire page.
    section = soup
    if args.get("section"):
        section = soup.find(args.get("section"), _class_filter(args, "sectionClass"))

    articles = section.find_all(args.get("article"), _class_filter(args, "articleClass"))
    # NOTE(review): assumes the page has a <title>; a page without one raises here.
    feed = Feed(title=soup.title.get_text(), url=args.get("url"), items=[])

    for article in articles:
        title = article.find(args.get("title"), _class_filter(args, "titleClass"))
        if title:
            title = title.get_text()

        # Content: every matching tag's text, joined with <br>; defaults to <p>.
        content_tag = args.get("content") or "p"
        paragraphs = article.find_all(content_tag, _class_filter(args, "contentClass"))
        content = ""
        if paragraphs:
            content = "<br>".join(p.get_text() for p in paragraphs)

        # Link: first matching tag's href; defaults to <a>.
        link_tag = args.get("link") or "a"
        link = article.find(link_tag, _class_filter(args, "linkClass"))
        if link:
            link = link["href"]

        item_datetime = None
        if args.get("datetime"):
            item_datetime = article.find(args.get("datetime"), _class_filter(args, "datetimeClass"))
            if item_datetime:
                # Expects a tag carrying a 'datetime' attribute,
                # e.g. <time datetime="...">.
                item_datetime = dateparser.parse(item_datetime["datetime"])

        author = None
        if args.get("author"):
            # NOTE(review): author is kept as a bs4 Tag (get_text is never
            # called) — confirm downstream consumers expect a Tag, not a string.
            author = article.find(args.get("author"), _class_filter(args, "authorClass"))

        item = FeedItem(
            title=title,
            content=content,
            link=link,
            item_datetime=item_datetime,
            author=author,
        )
        # Drop articles where the title selector matched nothing.
        if item.title is not None:
            feed.items.append(item)

    # Newest first, but only when a datetime selector was given.
    if args.get("datetime"):
        feed.items.sort(reverse=True)
    return feed