Mirror of https://codeberg.org/Ewen/rudibridge.git
Leap forward

- added tests (pytest)
- use GET parameters to parse a page (example request below)
- return an actual RSS feed

commit 94547f2031 (parent 8aebaca9ad)
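As a sketch of how the reworked endpoint is now called — assuming the app factory exercised by api/tests/conftest.py (added below), with mozilla.org as a stand-in target:

# Hypothetical end-to-end call; the parameter names url, title and
# article come from the checks added to parse_page() in this commit.
from api import create_app

app = create_app()
client = app.test_client()

# "title" and "article" name HTML tags to scrape, not literal text.
# Note: this drives a real scrape of the target page.
response = client.get("/feed/?url=mozilla.org&title=h2&article=article")
assert response.status_code == 200
assert response.headers["Content-Type"] == "application/rss+xml"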
@@ -1,11 +0,0 @@
-from flask import Flask
-
-app = Flask(__name__)
-
-@app.route("/")
-def hello():
-    return "<p>Coucou.</p>"
-
-
-if __name__ == "__main__":
-    app.run()
@@ -1,6 +1,8 @@
 from datetime import datetime

-from flask import Blueprint, make_response, render_template
+import validators
+from flask import Blueprint, jsonify, make_response, render_template, request
+from werkzeug.exceptions import HTTPException

 from api.db import get_db
@@ -9,11 +11,49 @@ from .scraper import scrape
 bp = Blueprint("feed", __name__, url_prefix="/feed")


+class InvalidParameters(Exception):
+    status_code = 400
+
+    def __init__(self, message, status_code=None, payload=None):
+        super().__init__()
+        self.message = message
+        if status_code is not None:
+            self.status_code = status_code
+        self.payload = payload
+
+    def to_dict(self):
+        rv = dict(self.payload or ())
+        rv["message"] = self.message
+        return rv
+
+
+@bp.errorhandler(InvalidParameters)
+def invalid_parameters(e):
+    return jsonify(e.to_dict()), e.status_code
+
+
 @bp.route("/", methods=("GET",))
 def parse_page():
-    link = "https://www.ouest-france.fr/bretagne/rennes-35000/"
-    feed = scrape(link)
+    args = dict(request.args)
+
+    # Checking if mandatory parameters are present
+    if not args.get("url"):
+        raise InvalidParameters("Missing parameter: URL")
+
+    if not args.get("title"):
+        raise InvalidParameters("Missing parameter: title")
+
+    if not args.get("article"):
+        raise InvalidParameters("Missing parameter: article")
+
+    # Checking for correctness
+    if not args.get("url").startswith("https"):
+        args["url"] = "https://" + args.get("url")
+    if not validators.url(args.get("url")):
+        raise InvalidParameters("Incorrect URL")
+
+    feed = scrape(args)
     rss_xml = render_template("rss.xml", feed=feed, build_date=datetime.now())
     response = make_response(rss_xml)
-    response.headers['Content-Type'] = "application/rss+xml"
+    response.headers["Content-Type"] = "application/rss+xml"
     return response
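The error handler is registered on the blueprint, so only /feed/ routes return this JSON shape. The payload channel is not exercised in this commit, but a sketch of how it could be used:

# Hypothetical: extra keys passed as payload end up alongside "message"
# in the 400 body produced by invalid_parameters().
raise InvalidParameters("Incorrect URL", payload={"url": args.get("url")})
# -> HTTP 400, body {"message": "Incorrect URL", "url": "..."}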
@@ -5,16 +5,21 @@ import dateparser


 class FeedItem:
-    def __init__(self, title, content, author, link, item_datetime=datetime.now()):
+    def __init__(self, title, content, link, item_datetime=None, author=None):
         self.title = title
         self.content = content
-        self.author = author
         self.link = link
-        self.item_datetime = item_datetime.isoformat()
+        self.author = author
+        if item_datetime:
+            self.item_datetime = item_datetime.isoformat()
+        else:
+            self.item_datetime = None

     def __lt__(self, other):
         if self.item_datetime and other.item_datetime:
             return self.item_datetime < other.item_datetime
+        elif not self.title or not other.title:
+            return True
         else:
             return self.title < other.title
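The reworked __lt__ keeps comparisons total even when datetimes are missing: datetimes win when both sides have one, untitled items compare as smaller, and titles break the remaining ties. A quick illustration (the api.scraper import path is an assumption; the diff only shows the relative import from .scraper):

from api.scraper import FeedItem  # assumed module path

a = FeedItem(title="Alpha", content="", link="#")
b = FeedItem(title="Beta", content="", link="#")
assert a < b  # no datetimes: falls back to title order
assert FeedItem(title=None, content="", link="#") < a  # untitled compares as smaller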
@@ -27,37 +32,82 @@ class Feed:


 @bt.request(output=None)
-def scrape(request, link):
-    soup = request.bs4(link)
-    section = soup.find("section", {"class": "liste-articles"})
-    articles = section.find_all("article", {"class": "teaser-media-liste"})
+def scrape(request, args):
+    soup = request.bs4(args.get("url"), timeout=5)

-    feed = Feed(title=soup.title.get_text(), url=link, items=[])
+    # If section is provided, use it, else use the entire page/soup
+    section = soup
+    if args.get("section"):
+        section_class = None
+        if args.get("sectionClass"):
+            section_class = {"class": args.get("sectionClass")}
+        section = soup.find(args.get("section"), section_class)
+
+    article_class = None
+    if args.get("articleClass"):
+        article_class = {"class": args.get("articleClass")}
+    articles = section.find_all(args.get("article"), article_class)
+
+    feed = Feed(title=soup.title.get_text(), url=args.get("url"), items=[])

     for article in articles:
-        title = article.find("h2")
+        title_class = None
+        if args.get("titleClass"):
+            title_class = {"class": args.get("titleClass")}
+        title = article.find(args.get("title"), title_class)
         if title:
             title = title.get_text()

-        content = article.find("p")
-        if content:
-            content = content.get_text()
+        content_tag = "p"
+        content_class = None
+        if args.get("content"):
+            content_tag = args.get("content")
+        if args.get("contentClass"):
+            content_class = {"class": args.get("contentClass")}
+        paragraphs = article.find_all(content_tag, content_class)
+        content = ""
+        if paragraphs:
+            content = "<br>".join([p.get_text() for p in paragraphs])

-        link = article.find("a", {"class": "titre-lien"})
+        link_tag = "a"
+        link_class = None
+        if args.get("link"):
+            link_tag = args.get("link")
+        if args.get("linkClass"):
+            link_class = {"class": args.get("linkClass")}
+        link = article.find(link_tag, link_class)
         if link:
             link = link["href"]

-        item_datetime = article.find("time")
-        if item_datetime:
-            item_datetime = dateparser.parse(item_datetime["datetime"])
+        item_datetime = None
+        item_datetime_class = None
+        if args.get("datetime"):
+            if args.get("datetimeClass"):
+                item_datetime_class = {"class": args.get("datetimeClass")}
+            item_datetime = article.find(args.get("datetime"), item_datetime_class)
+            if item_datetime:
+                item_datetime = dateparser.parse(item_datetime["datetime"])
+
+        author = None
+        author_class = None
+        if args.get("author"):
+            if args.get("authorClass"):
+                author_class = {"class": args.get("authorClass")}
+            author = article.find(args.get("author"), author_class)

         item = FeedItem(
             title=title,
             content=content,
-            author="Ouest-France",
             link=link,
             item_datetime=item_datetime,
+            author=author
         )
-        feed.items.append(item)

-    feed.items.sort(reverse=True)
+        if item.title is not None:
+            feed.items.append(item)
+
+    # Sort by datetime if any
+    if args.get("datetime"):
+        feed.items.sort(reverse=True)

     return feed
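scrape() now treats the query string as a tag/class selector table. Since the code uses the soup returned by request.bs4() exactly like a BeautifulSoup object, the selection logic can be sketched with plain bs4 (the tag names and classes below are made up):

from bs4 import BeautifulSoup

html = """
<section class="news">
  <article class="teaser"><h2>First</h2><p>Body</p><a href="/a1">more</a></article>
  <article class="teaser"><h2>Second</h2><p>Body</p><a href="/a2">more</a></article>
</section>
"""
soup = BeautifulSoup(html, "html.parser")

# Equivalent of ?section=section&sectionClass=news&article=article
#               &articleClass=teaser&title=h2
section = soup.find("section", {"class": "news"})
articles = section.find_all("article", {"class": "teaser"})
for article in articles:
    print(article.find("h2").get_text())  # prints: First, Second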
@@ -2,9 +2,9 @@
 <rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
   <channel>
     <title>{{ feed.title }}</title>
-    <atom:link href="{{ request.base_url }}" rel="self" type="application/rss+xml"/>
-    <link>{{ request.base_url }}</link>
-    <description>A feed generated from {{feed.url}} with Rudibridge</description>
+    <atom:link href="{{ request.url }}" rel="self" type="application/rss+xml"/>
+    <link>{{ request.url }}</link>
+    <description>A feed generated from {{ feed.url }} with Rudibridge</description>
     <lastBuildDate>{{ build_date.strftime("%a, %d %b %Y %T") }} +0000</lastBuildDate>
     {% for item in feed.items %}
     <item>
@@ -21,6 +21,9 @@
     {% if item.item_datetime %}
     <pubDate>{{ item.item_datetime }}</pubDate>
     {% endif %}
+    {% if item.author %}
+    <author>{{ item.author }}</author>
+    {% endif %}
     </item>
     {% endfor %}
   </channel>
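Switching the template from request.base_url to request.url keeps the query string in the self-referencing link, so atom:link and <link> now point back at the fully parameterised feed rather than the bare /feed/ endpoint.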
api/poetry.lock (generated; 13 lines changed)
@@ -1149,6 +1149,17 @@ brotli = ["brotli (==1.0.9)", "brotli (>=1.0.9)", "brotlicffi (>=0.8.0)", "brotl
 secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "ipaddress", "pyOpenSSL (>=0.14)", "urllib3-secure-extra"]
 socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"]

+[[package]]
+name = "validators"
+version = "0.28.1"
+description = "Python Data Validation for Humans™"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "validators-0.28.1-py3-none-any.whl", hash = "sha256:890c98789ad884037f059af6ea915ec2d667129d509180c2c590b8009a4c4219"},
+    {file = "validators-0.28.1.tar.gz", hash = "sha256:5ac88e7916c3405f0ce38ac2ac82a477fcf4d90dbbeddd04c8193171fc17f7dc"},
+]
+
 [[package]]
 name = "werkzeug"
 version = "3.0.2"
@@ -1183,4 +1194,4 @@ h11 = ">=0.9.0,<1"
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.11"
-content-hash = "570ed3ea32a7c870f94ad73cdc122034842353bbef4e7c6111dcd150f79394ac"
+content-hash = "d0e30c9d7e8186fec20b1716f1a47e7e759e26cd68c5af13bd1ef4b9259426dd"
@@ -15,6 +15,7 @@ coverage = "^7.5.1"
 requests = "^2.31.0"
 botasaurus = "^4.0.14"
 dateparser = "^1.2.0"
+validators = "^0.28.1"


 [build-system]
api/tests/conftest.py (new file; 15 lines)
@@ -0,0 +1,15 @@
+import pytest
+from api import create_app
+
+@pytest.fixture()
+def app():
+    app = create_app()
+    app.config.update({
+        "TESTING": True
+    })
+
+    yield app
+
+@pytest.fixture()
+def client(app):
+    return app.test_client()
api/tests/test_parameters.py (new file; 35 lines)
@@ -0,0 +1,35 @@
+import pytest
+
+
+@pytest.mark.parametrize(
+    "url,missing_parameter",
+    [
+        ("/feed/", "URL"),
+        ("/feed/?url=https://mozilla.org", "title"),
+        ("/feed/?url=https://mozilla.org&title=h2", "article"),
+        ("/feed/?url=https://mozilla.org&title=h2&article=article", None),
+    ],
+)
+def test_missing_parameters(client, url, missing_parameter):
+    response = client.get(url)
+
+    if missing_parameter:
+        assert response.json["message"] == f"Missing parameter: {missing_parameter}"
+        assert response.status_code == 400
+    else:
+        assert response.status_code == 200
+
+
+@pytest.mark.parametrize(
+    "url,status_code,message",
+    [
+        ("https://mozilla.org", 200, None),
+        ("mozilla.org", 200, None),
+        ("toto", 400, "Incorrect URL"),
+    ],
+)
+def test_incorrect_url(client, url, status_code, message):
+    response = client.get(f"/feed/?url={url}&title=h2&article=article")
+    assert response.status_code == status_code
+    if message:
+        assert response.json["message"] == message
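These tests drive the full stack through Flask's test client, so the cases that expect a 200 trigger a real scrape of mozilla.org; assuming scrape() is not stubbed elsewhere, they need network access to pass.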