From 94547f203116e50e37b03df2f0da5595ea8ef7cd Mon Sep 17 00:00:00 2001
From: Ewen
Date: Wed, 8 May 2024 10:13:54 +0200
Subject: [PATCH] Leap forward

- added tests (pytest)
- use GET parameters to parse a page
- return an actual RSS feed
---
 api/api/app.py               | 11 -----
 api/api/feed.py              | 50 ++++++++++++++++++---
 api/api/scraper.py           | 86 ++++++++++++++++++++++++++++--------
 api/api/templates/rss.xml    |  9 ++--
 api/poetry.lock              | 13 +++++-
 api/pyproject.toml           |  1 +
 api/tests/conftest.py        | 15 +++++++
 api/tests/test_parameters.py | 35 +++++++++++++++
 8 files changed, 182 insertions(+), 38 deletions(-)
 delete mode 100644 api/api/app.py
 create mode 100644 api/tests/conftest.py
 create mode 100644 api/tests/test_parameters.py

diff --git a/api/api/app.py b/api/api/app.py
deleted file mode 100644
index ccddca2..0000000
--- a/api/api/app.py
+++ /dev/null
@@ -1,11 +0,0 @@
-from flask import Flask
-
-app = Flask(__name__)
-
-@app.route("/")
-def hello():
-    return "<p>Coucou.</p>"
-
-
-if __name__ == "__main__":
-    app.run()
\ No newline at end of file
diff --git a/api/api/feed.py b/api/api/feed.py
index d62b6e9..8c9b56c 100644
--- a/api/api/feed.py
+++ b/api/api/feed.py
@@ -1,6 +1,8 @@
 from datetime import datetime
 
-from flask import Blueprint, make_response, render_template
+import validators
+from flask import Blueprint, jsonify, make_response, render_template, request
+from werkzeug.exceptions import HTTPException
 
 from api.db import get_db
 
@@ -9,11 +11,49 @@ from .scraper import scrape
 bp = Blueprint("feed", __name__, url_prefix="/feed")
 
 
+class InvalidParameters(Exception):
+    status_code = 400
+
+    def __init__(self, message, status_code=None, payload=None):
+        super().__init__()
+        self.message = message
+        if status_code is not None:
+            self.status_code = status_code
+        self.payload = payload
+
+    def to_dict(self):
+        rv = dict(self.payload or ())
+        rv["message"] = self.message
+        return rv
+
+
+@bp.errorhandler(InvalidParameters)
+def invalid_parameters(e):
+    return jsonify(e.to_dict()), e.status_code
+
+
 @bp.route("/", methods=("GET",))
 def parse_page():
-    link = "https://www.ouest-france.fr/bretagne/rennes-35000/"
-    feed = scrape(link)
+    args = dict(request.args)
+
+    # Checking if mandatory parameters are present
+    if not args.get("url"):
+        raise InvalidParameters("Missing parameter: URL")
+
+    if not args.get("title"):
+        raise InvalidParameters("Missing parameter: title")
+
+    if not args.get("article"):
+        raise InvalidParameters("Missing parameter: article")
+
+    # Checking for correctness
+    if not args.get("url").startswith("https"):
+        args["url"] = "https://" + args.get("url")
+    if not validators.url(args.get("url")):
+        raise InvalidParameters("Incorrect URL")
+
+    feed = scrape(args)
     rss_xml = render_template("rss.xml", feed=feed, build_date=datetime.now())
     response = make_response(rss_xml)
-    response.headers['Content-Type'] = "application/rss+xml"
-    return response
\ No newline at end of file
+    response.headers["Content-Type"] = "application/rss+xml"
+    return response
diff --git a/api/api/scraper.py b/api/api/scraper.py
index a845354..15325fb 100644
--- a/api/api/scraper.py
+++ b/api/api/scraper.py
@@ -5,16 +5,21 @@ import dateparser
 
 
 class FeedItem:
-    def __init__(self, title, content, author, link, item_datetime=datetime.now()):
+    def __init__(self, title, content, link, item_datetime=None, author=None):
         self.title = title
         self.content = content
-        self.author = author
         self.link = link
-        self.item_datetime = item_datetime.isoformat()
+        self.author = author
+        if item_datetime:
+            self.item_datetime = item_datetime.isoformat()
+        else:
+            self.item_datetime = None
 
     def __lt__(self, other):
         if self.item_datetime and other.item_datetime:
             return self.item_datetime < other.item_datetime
+        elif not self.title or not other.title:
+            return True
         else:
             return self.title < other.title
 
@@ -27,37 +32,82 @@ class Feed:
 
 
 @bt.request(output=None)
-def scrape(request, link):
-    soup = request.bs4(link)
-    section = soup.find("section", {"class": "liste-articles"})
-    articles = section.find_all("article", {"class": "teaser-media-liste"})
+def scrape(request, args):
+    soup = request.bs4(args.get("url"), timeout=5)
 
-    feed = Feed(title=soup.title.get_text(), url=link, items=[])
+    # If section is provided, use it, else use the entire page/soup
+    section = soup
+    if args.get("section"):
+        section_class = None
+        if args.get("sectionClass"):
+            section_class = {"class": args.get("sectionClass")}
+        section = soup.find(args.get("section"), section_class)
+
+    article_class = None
args.get("articleClass"): + article_class = {"class": args.get("articleClass")} + articles = section.find_all(args.get("article"), article_class) + + feed = Feed(title=soup.title.get_text(), url=args.get("url"), items=[]) for article in articles: - title = article.find("h2") + title_class = None + if args.get("titleClass"): + title_class = {"class": args.get("titleClass")} + title = article.find(args.get("title"), title_class) if title: title = title.get_text() - content = article.find("p") - if content: - content = content.get_text() + content_tag = "p" + content_class = None + if args.get("content"): + content_tag = args.get("content") + if args.get("contentClass"): + content_class = {"class": args.get("contentClass")} + paragraphs = article.find_all(content_tag, content_class) + content = "" + if paragraphs: + content = "
".join([p.get_text() for p in paragraphs]) - link = article.find("a", {"class": "titre-lien"}) + link_tag = "a" + link_class = None + if args.get("link"): + link_tag = args.get("link") + if args.get("linkClass"): + link_class = {"class": args.get("linkClass")} + link = article.find(link_tag, link_class) if link: link = link["href"] - item_datetime = article.find("time") - if item_datetime: - item_datetime = dateparser.parse(item_datetime["datetime"]) + item_datetime = None + item_datetime_class = None + if args.get("datetime"): + if args.get("datetimeClass"): + item_datetime_class = {"class": args.get("datetimeClass")} + item_datetime = article.find(args.get("datetime"), item_datetime_class) + if item_datetime: + item_datetime = dateparser.parse(item_datetime["datetime"]) + + author = None + author_class = None + if args.get("author"): + if args.get("authorClass"): + author_class = {"class": args.get("authorClass")} + author = article.find(args.get("author"), author_class) + item = FeedItem( title=title, content=content, - author="Ouest-France", link=link, item_datetime=item_datetime, + author=author ) - feed.items.append(item) + + if item.title is not None: + feed.items.append(item) + + # Sort by datetime if any + if args.get("datetime"): feed.items.sort(reverse=True) return feed diff --git a/api/api/templates/rss.xml b/api/api/templates/rss.xml index 0eee2a9..964b83a 100644 --- a/api/api/templates/rss.xml +++ b/api/api/templates/rss.xml @@ -2,9 +2,9 @@ {{ feed.title }} - - {{ request.base_url }} - A feed generated from {{feed.url}} with Rudibridge + + {{ request.url }} + A feed generated from {{ feed.url }} with Rudibridge {{ build_date.strftime("%a, %d %b %Y %T") }} +0000 {% for item in feed.items %} @@ -21,6 +21,9 @@ {% if item.item_datetime %} {{ item.item_datetime }} {% endif %} + {% if item.author %} + {{ item.author }} + {% endif %} {% endfor %} diff --git a/api/poetry.lock b/api/poetry.lock index ee25d29..38df1e9 100644 --- a/api/poetry.lock +++ b/api/poetry.lock @@ -1149,6 +1149,17 @@ brotli = ["brotli (==1.0.9)", "brotli (>=1.0.9)", "brotlicffi (>=0.8.0)", "brotl secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "ipaddress", "pyOpenSSL (>=0.14)", "urllib3-secure-extra"] socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] +[[package]] +name = "validators" +version = "0.28.1" +description = "Python Data Validation for Humans™" +optional = false +python-versions = ">=3.8" +files = [ + {file = "validators-0.28.1-py3-none-any.whl", hash = "sha256:890c98789ad884037f059af6ea915ec2d667129d509180c2c590b8009a4c4219"}, + {file = "validators-0.28.1.tar.gz", hash = "sha256:5ac88e7916c3405f0ce38ac2ac82a477fcf4d90dbbeddd04c8193171fc17f7dc"}, +] + [[package]] name = "werkzeug" version = "3.0.2" @@ -1183,4 +1194,4 @@ h11 = ">=0.9.0,<1" [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "570ed3ea32a7c870f94ad73cdc122034842353bbef4e7c6111dcd150f79394ac" +content-hash = "d0e30c9d7e8186fec20b1716f1a47e7e759e26cd68c5af13bd1ef4b9259426dd" diff --git a/api/pyproject.toml b/api/pyproject.toml index 5e9b18d..8c26b42 100644 --- a/api/pyproject.toml +++ b/api/pyproject.toml @@ -15,6 +15,7 @@ coverage = "^7.5.1" requests = "^2.31.0" botasaurus = "^4.0.14" dateparser = "^1.2.0" +validators = "^0.28.1" [build-system] diff --git a/api/tests/conftest.py b/api/tests/conftest.py new file mode 100644 index 0000000..db3bacf --- /dev/null +++ b/api/tests/conftest.py @@ -0,0 +1,15 @@ +import pytest +from api import create_app + +@pytest.fixture() +def app(): + app = create_app() + 
+    app.config.update({
+        "TESTING": True
+    })
+
+    yield app
+
+@pytest.fixture()
+def client(app):
+    return app.test_client()
\ No newline at end of file
diff --git a/api/tests/test_parameters.py b/api/tests/test_parameters.py
new file mode 100644
index 0000000..885a846
--- /dev/null
+++ b/api/tests/test_parameters.py
@@ -0,0 +1,35 @@
+import pytest
+
+
+@pytest.mark.parametrize(
+    "url,missing_parameter",
+    [
+        ("/feed/", "URL"),
+        ("/feed/?url=https://mozilla.org", "title"),
+        ("/feed/?url=https://mozilla.org&title=h2", "article"),
+        ("/feed/?url=https://mozilla.org&title=h2&article=article", None),
+    ],
+)
+def test_missing_parameters(client, url, missing_parameter):
+    response = client.get(url)
+
+    if missing_parameter:
+        assert response.json["message"] == f"Missing parameter: {missing_parameter}"
+        assert response.status_code == 400
+    else:
+        assert response.status_code == 200
+
+
+@pytest.mark.parametrize(
+    "url,status_code,message",
+    [
+        ("https://mozilla.org", 200, None),
+        ("mozilla.org", 200, None),
+        ("toto", 400, "Incorrect URL"),
+    ],
+)
+def test_incorrect_url(client, url, status_code, message):
+    response = client.get(f"/feed/?url={url}&title=h2&article=article")
+    assert response.status_code == status_code
+    if message:
+        assert response.json["message"] == message
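
Reviewer note: a quick manual way to exercise the new endpoint, separate from the pytest suite above. This is an illustrative sketch only; the host/port and every selector value below are assumptions rather than part of the patch. The query parameters themselves (the mandatory url, title and article, plus the optional section, link, content, datetime, author and their *Class variants) are the ones introduced in feed.py and scraper.py, and requests is already a project dependency per pyproject.toml.

# smoke_check.py - hypothetical manual check, not included in this commit.
# Assumes the Flask dev server is running on http://localhost:5000 and that
# the target page wraps each story in an <article> tag titled by an <h2>.
import requests

params = {
    "url": "www.ouest-france.fr/bretagne/rennes-35000/",  # scheme optional: feed.py prepends "https://"
    "title": "h2",                         # mandatory: tag carrying each item's title
    "article": "article",                  # mandatory: tag wrapping one feed item
    "articleClass": "teaser-media-liste",  # optional class filter, as in the old hard-coded scraper
    "datetime": "time",                    # optional: also enables the reverse-chronological sort
}
response = requests.get("http://localhost:5000/feed/", params=params)
print(response.status_code)              # 200, or 400 with a JSON {"message": ...} body
print(response.headers["Content-Type"])  # application/rss+xml
print(response.text[:300])               # start of the rendered rss.xml template

The tests themselves run with plain pytest from the api/ directory (e.g. "poetry run pytest"), picking up the app and client fixtures from conftest.py.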