diff --git a/api/api/app.py b/api/api/app.py
deleted file mode 100644
index ccddca2..0000000
--- a/api/api/app.py
+++ /dev/null
@@ -1,11 +0,0 @@
-from flask import Flask
-
-app = Flask(__name__)
-
-@app.route("/")
-def hello():
- return "\nCoucou.\n"
-
-
-if __name__ == "__main__":
- app.run()
\ No newline at end of file
diff --git a/api/api/feed.py b/api/api/feed.py
index d62b6e9..8c9b56c 100644
--- a/api/api/feed.py
+++ b/api/api/feed.py
@@ -1,6 +1,8 @@
from datetime import datetime
-from flask import Blueprint, make_response, render_template
+import validators
+from flask import Blueprint, jsonify, make_response, render_template, request
+from werkzeug.exceptions import HTTPException
from api.db import get_db
@@ -9,11 +11,49 @@ from .scraper import scrape
bp = Blueprint("feed", __name__, url_prefix="/feed")
+class InvalidParameters(Exception):
+ status_code = 400
+
+ def __init__(self, message, status_code=None, payload=None):
+ super().__init__()
+ self.message = message
+ if status_code is not None:
+ self.status_code = status_code
+ self.payload = payload
+
+ def to_dict(self):
+ rv = dict(self.payload or ())
+ rv["message"] = self.message
+ return rv
+
+
+@bp.errorhandler(InvalidParameters)
+def invalid_parameters(e):
+ return jsonify(e.to_dict()), e.status_code
+
+
@bp.route("/", methods=("GET",))
def parse_page():
- link = "https://www.ouest-france.fr/bretagne/rennes-35000/"
- feed = scrape(link)
+ args = dict(request.args)
+
+ # Checking if mandatory parameters are present
+ if not args.get("url"):
+ raise InvalidParameters("Missing parameter: URL")
+
+ if not args.get("title"):
+ raise InvalidParameters("Missing parameter: title")
+
+ if not args.get("article"):
+ raise InvalidParameters("Missing parameter: article")
+
+ # Checking for correctness
+ if not args.get("url").startswith("https"):
+ args["url"] = "https://" + args.get("url")
+ if not validators.url(args.get("url")):
+ raise InvalidParameters("Incorrect URL")
+
+ feed = scrape(args)
rss_xml = render_template("rss.xml", feed=feed, build_date=datetime.now())
response = make_response(rss_xml)
- response.headers['Content-Type'] = "application/rss+xml"
- return response
\ No newline at end of file
+ response.headers["Content-Type"] = "application/rss+xml"
+ return response
diff --git a/api/api/scraper.py b/api/api/scraper.py
index a845354..15325fb 100644
--- a/api/api/scraper.py
+++ b/api/api/scraper.py
@@ -5,16 +5,21 @@ import dateparser
class FeedItem:
- def __init__(self, title, content, author, link, item_datetime=datetime.now()):
+ def __init__(self, title, content, link, item_datetime=None, author=None):
self.title = title
self.content = content
- self.author = author
self.link = link
- self.item_datetime = item_datetime.isoformat()
+ self.author = author
+ if item_datetime:
+ self.item_datetime = item_datetime.isoformat()
+ else:
+ self.item_datetime = None
def __lt__(self, other):
if self.item_datetime and other.item_datetime:
return self.item_datetime < other.item_datetime
+ elif not self.title or not other.title:
+ return True
else:
return self.title < other.title
@@ -27,37 +32,82 @@ class Feed:
@bt.request(output=None)
-def scrape(request, link):
- soup = request.bs4(link)
- section = soup.find("section", {"class": "liste-articles"})
- articles = section.find_all("article", {"class": "teaser-media-liste"})
+def scrape(request, args):
+ soup = request.bs4(args.get("url"), timeout=5)
- feed = Feed(title=soup.title.get_text(), url=link, items=[])
+ # If section is provided, use it, else use the entire page/soup
+ section = soup
+ if args.get("section"):
+ section_class = None
+ if args.get("sectionClass"):
+ section_class = {"class": args.get("sectionClass")}
+ section = soup.find(args.get("section"), section_class)
+
+ article_class = None
+ if args.get("articleClass"):
+ article_class = {"class": args.get("articleClass")}
+ articles = section.find_all(args.get("article"), article_class)
+
+ feed = Feed(title=soup.title.get_text(), url=args.get("url"), items=[])
for article in articles:
- title = article.find("h2")
+ title_class = None
+ if args.get("titleClass"):
+ title_class = {"class": args.get("titleClass")}
+ title = article.find(args.get("title"), title_class)
if title:
title = title.get_text()
- content = article.find("p")
- if content:
- content = content.get_text()
+ content_tag = "p"
+ content_class = None
+ if args.get("content"):
+ content_tag = args.get("content")
+ if args.get("contentClass"):
+ content_class = {"class": args.get("contentClass")}
+ paragraphs = article.find_all(content_tag, content_class)
+ content = ""
+ if paragraphs:
+ content = "\n".join([p.get_text() for p in paragraphs])
- link = article.find("a", {"class": "titre-lien"})
+ link_tag = "a"
+ link_class = None
+ if args.get("link"):
+ link_tag = args.get("link")
+ if args.get("linkClass"):
+ link_class = {"class": args.get("linkClass")}
+ link = article.find(link_tag, link_class)
if link:
link = link["href"]
- item_datetime = article.find("time")
- if item_datetime:
- item_datetime = dateparser.parse(item_datetime["datetime"])
+ item_datetime = None
+ item_datetime_class = None
+ if args.get("datetime"):
+ if args.get("datetimeClass"):
+ item_datetime_class = {"class": args.get("datetimeClass")}
+ item_datetime = article.find(args.get("datetime"), item_datetime_class)
+ if item_datetime:
+ item_datetime = dateparser.parse(item_datetime["datetime"])
+
+ author = None
+ author_class = None
+ if args.get("author"):
+ if args.get("authorClass"):
+ author_class = {"class": args.get("authorClass")}
+ author = article.find(args.get("author"), author_class)
+
item = FeedItem(
title=title,
content=content,
- author="Ouest-France",
link=link,
item_datetime=item_datetime,
+ author=author
)
- feed.items.append(item)
+
+ if item.title is not None:
+ feed.items.append(item)
+
+ # Sort by datetime if any
+ if args.get("datetime"):
feed.items.sort(reverse=True)
return feed
diff --git a/api/api/templates/rss.xml b/api/api/templates/rss.xml
index 0eee2a9..964b83a 100644
--- a/api/api/templates/rss.xml
+++ b/api/api/templates/rss.xml
@@ -2,9 +2,9 @@
{{ feed.title }}
-
- {{ request.base_url }}
- A feed generated from {{feed.url}} with Rudibridge
+
+ {{ request.url }}
+ A feed generated from {{ feed.url }} with Rudibridge
{{ build_date.strftime("%a, %d %b %Y %T") }} +0000
{% for item in feed.items %}
-
@@ -21,6 +21,9 @@
{% if item.item_datetime %}
{{ item.item_datetime }}
{% endif %}
+ {% if item.author %}
+ {{ item.author }}
+ {% endif %}
{% endfor %}
diff --git a/api/poetry.lock b/api/poetry.lock
index ee25d29..38df1e9 100644
--- a/api/poetry.lock
+++ b/api/poetry.lock
@@ -1149,6 +1149,17 @@ brotli = ["brotli (==1.0.9)", "brotli (>=1.0.9)", "brotlicffi (>=0.8.0)", "brotl
secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "ipaddress", "pyOpenSSL (>=0.14)", "urllib3-secure-extra"]
socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"]
+[[package]]
+name = "validators"
+version = "0.28.1"
+description = "Python Data Validation for Humans™"
+optional = false
+python-versions = ">=3.8"
+files = [
+ {file = "validators-0.28.1-py3-none-any.whl", hash = "sha256:890c98789ad884037f059af6ea915ec2d667129d509180c2c590b8009a4c4219"},
+ {file = "validators-0.28.1.tar.gz", hash = "sha256:5ac88e7916c3405f0ce38ac2ac82a477fcf4d90dbbeddd04c8193171fc17f7dc"},
+]
+
[[package]]
name = "werkzeug"
version = "3.0.2"
@@ -1183,4 +1194,4 @@ h11 = ">=0.9.0,<1"
[metadata]
lock-version = "2.0"
python-versions = "^3.11"
-content-hash = "570ed3ea32a7c870f94ad73cdc122034842353bbef4e7c6111dcd150f79394ac"
+content-hash = "d0e30c9d7e8186fec20b1716f1a47e7e759e26cd68c5af13bd1ef4b9259426dd"
diff --git a/api/pyproject.toml b/api/pyproject.toml
index 5e9b18d..8c26b42 100644
--- a/api/pyproject.toml
+++ b/api/pyproject.toml
@@ -15,6 +15,7 @@ coverage = "^7.5.1"
requests = "^2.31.0"
botasaurus = "^4.0.14"
dateparser = "^1.2.0"
+validators = "^0.28.1"
[build-system]
diff --git a/api/tests/conftest.py b/api/tests/conftest.py
new file mode 100644
index 0000000..db3bacf
--- /dev/null
+++ b/api/tests/conftest.py
@@ -0,0 +1,15 @@
+import pytest
+from api import create_app
+
+@pytest.fixture()
+def app():
+ app = create_app()
+ app.config.update({
+ "TESTING": True
+ })
+
+ yield app
+
+@pytest.fixture()
+def client(app):
+ return app.test_client()
\ No newline at end of file
diff --git a/api/tests/test_parameters.py b/api/tests/test_parameters.py
new file mode 100644
index 0000000..885a846
--- /dev/null
+++ b/api/tests/test_parameters.py
@@ -0,0 +1,35 @@
+import pytest
+
+
+@pytest.mark.parametrize(
+ "url,missing_parameter",
+ [
+ ("/feed/", "URL"),
+ ("/feed/?url=https://mozilla.org", "title"),
+ ("/feed/?url=https://mozilla.org&title=h2", "article"),
+ ("/feed/?url=https://mozilla.org&title=h2&article=article", None),
+ ],
+)
+def test_missing_parameters(client, url, missing_parameter):
+ response = client.get(url)
+
+ if missing_parameter:
+ assert response.json["message"] == f"Missing parameter: {missing_parameter}"
+ assert response.status_code == 400
+ else:
+ assert response.status_code == 200
+
+
+@pytest.mark.parametrize(
+ "url,status_code,message",
+ [
+ ("https://mozilla.org", 200, None),
+ ("mozilla.org", 200, None),
+ ("toto", 400, "Incorrect URL"),
+ ],
+)
+def test_incorrect_url(client, url, status_code, message):
+ response = client.get(f"/feed/?url={url}&title=h2&article=article")
+ assert response.status_code == status_code
+ if message:
+ assert response.json["message"] == message