mirror of
https://codeberg.org/Ewen/rudibridge.git
synced 2025-04-17 13:05:54 +00:00
- added tests (pytest) - use GET parameters to parse a page - return an actual RSS feed
114 lines
3.5 KiB
Python
114 lines
3.5 KiB
Python
from datetime import datetime
|
|
|
|
import botasaurus as bt
|
|
import dateparser
|
|
|
|
|
|
class FeedItem:
    """A single entry of a generated feed.

    The datetime is stored as an ISO-8601 string (or None) so items can be
    ordered with plain string comparison.
    """

    def __init__(self, title, content, link, item_datetime=None, author=None):
        self.title = title
        self.content = content
        self.link = link
        self.author = author
        # Keep a sortable ISO string rather than the datetime object itself.
        self.item_datetime = item_datetime.isoformat() if item_datetime else None

    def __lt__(self, other):
        """Order by datetime when both sides have one, otherwise by title.

        A pair where either title is missing/empty compares as "less than".
        """
        if self.item_datetime and other.item_datetime:
            return self.item_datetime < other.item_datetime
        if not self.title or not other.title:
            return True
        return self.title < other.title
|
|
|
|
class Feed:
    """A scraped page rendered as a feed: a title, the source URL, and its items."""

    def __init__(self, title, url, items):
        # Plain data holder; items is expected to be a list of FeedItem.
        self.url = url
        self.items = items
        self.title = title
|
|
|
|
|
|
|
|
def _class_filter(args, key):
    """Return a bs4 attribute filter {"class": ...} for args[key], or None."""
    css_class = args.get(key)
    return {"class": css_class} if css_class else None


@bt.request(output=None)
def scrape(request, args):
    """Scrape the page described by the GET parameters in *args* into a Feed.

    Recognized args keys (all optional except "url"):
      url                      -- page to fetch
      section / sectionClass   -- tag (and class) wrapping the article list
      article / articleClass   -- tag (and class) of one article
      title / titleClass       -- tag (and class) of an article's title
      content / contentClass   -- tag (and class) of content paragraphs (default "p")
      link / linkClass         -- tag (and class) of the article link (default "a")
      datetime / datetimeClass -- tag (and class) carrying a "datetime" attribute
      author / authorClass     -- tag (and class) of the author name

    Returns a Feed; items lacking a title are dropped, and items are sorted
    newest-first when a "datetime" selector is given.
    """
    soup = request.bs4(args.get("url"), timeout=5)

    # If a section selector is provided, narrow the search to it; fall back to
    # the whole page when the selector matches nothing (the original code
    # raised AttributeError on section.find_all in that case).
    section = soup
    if args.get("section"):
        found = soup.find(args.get("section"), _class_filter(args, "sectionClass"))
        if found is not None:
            section = found

    articles = section.find_all(args.get("article"), _class_filter(args, "articleClass"))

    # Guard against pages with no <title> tag (soup.title is None there).
    page_title = soup.title.get_text() if soup.title else ""
    feed = Feed(title=page_title, url=args.get("url"), items=[])

    for article in articles:
        title = article.find(args.get("title"), _class_filter(args, "titleClass"))
        if title:
            title = title.get_text()

        # Content class is only honored when an explicit content tag is given
        # (matches the original nesting of the checks).
        content_tag = "p"
        content_class = None
        if args.get("content"):
            content_tag = args.get("content")
            content_class = _class_filter(args, "contentClass")
        paragraphs = article.find_all(content_tag, content_class)
        content = "<br>".join(p.get_text() for p in paragraphs) if paragraphs else ""

        # Same nesting rule for the link class.
        link_tag = "a"
        link_class = None
        if args.get("link"):
            link_tag = args.get("link")
            link_class = _class_filter(args, "linkClass")
        link = article.find(link_tag, link_class)
        if link:
            # .get() instead of ["href"]: a matched tag may lack the attribute.
            link = link.get("href")

        item_datetime = None
        if args.get("datetime"):
            dt_tag = article.find(args.get("datetime"), _class_filter(args, "datetimeClass"))
            # Guard the attribute lookup: the tag may exist without a
            # "datetime" attribute, which previously raised KeyError.
            if dt_tag and dt_tag.get("datetime"):
                item_datetime = dateparser.parse(dt_tag["datetime"])

        author = None
        if args.get("author"):
            author_tag = article.find(args.get("author"), _class_filter(args, "authorClass"))
            if author_tag:
                # Extract the text like every other field; the original stored
                # the bs4 Tag object itself, which was almost certainly
                # unintended (all sibling fields are strings).
                author = author_tag.get_text()

        item = FeedItem(
            title=title,
            content=content,
            link=link,
            item_datetime=item_datetime,
            author=author,
        )

        # Skip articles where no title could be found.
        if item.title is not None:
            feed.items.append(item)

    # Newest first when a datetime selector was given (FeedItem.__lt__ sorts
    # on the ISO datetime string).
    if args.get("datetime"):
        feed.items.sort(reverse=True)

    return feed
|