feat: first somewhat (hardcoded) working version

Works only for Ouest-France for now; it still needs to be generalized.
This commit is contained in:
Ewen 2024-05-06 22:19:18 +02:00
parent d21ad550d9
commit 8aebaca9ad
17 changed files with 1446 additions and 0 deletions

2
.env.dev Normal file
View file

@ -0,0 +1,2 @@
FLASK_SECRET_KEY="dev"
FLASK_DEBUG=True

25
api/Dockerfile Normal file
View file

@ -0,0 +1,25 @@
# Development image for the Rudibridge API: installs the system packages and
# Poetry, resolves the project dependencies, then starts the Flask dev server.
FROM python:3.11-alpine AS requirements
RUN apk update \
    && apk add --no-cache \
        build-base \
        chromium \
        gcc \
        libc-dev \
        linux-headers \
        pipx \
        python3-dev \
    && pipx install poetry
# pipx puts the poetry executable under /root/.local/bin.
ENV PATH=/root/.local/bin:${PATH}
RUN mkdir /app
# With more than one source, COPY requires a directory destination that ends
# with a trailing slash.
COPY pyproject.toml poetry.lock /app/
WORKDIR /app
RUN poetry --version
RUN /root/.local/bin/poetry install
#CMD ["poetry", "run", "gunicorn", "--bind=0.0.0.0:8080", "--reload", "app:app"]
CMD ["poetry", "run", "flask", "--app", "api", "run", "--host=0.0.0.0", "--port=8080", "--debug"]

2
api/README.md Normal file
View file

@ -0,0 +1,2 @@
## Rudibridge
A rudimentary RSS bridge that generates feeds for regularly updated web pages that don't provide one.

32
api/api/__init__.py Normal file
View file

@ -0,0 +1,32 @@
import os
from flask import Flask
def create_app(test_config=None):
    """Build and configure the Flask application (application factory).

    Args:
        test_config: Optional mapping of configuration values. When given it
            is applied on top of the defaults; when ``None`` the configuration
            is completed from prefixed environment variables instead.

    Returns:
        The configured ``Flask`` instance with the database hooks and the
        feed blueprint registered.
    """
    app = Flask(__name__, instance_relative_config=True)

    defaults = {
        #!FIXME: secret key to change for production
        "SECRET_KEY": "dev",
        "DATABASE": os.path.join(app.instance_path, "db.sqlite"),
    }
    app.config.from_mapping(defaults)

    if test_config is not None:
        app.config.from_mapping(test_config)
    else:
        app.config.from_prefixed_env()

    # Best effort: the instance folder holds the SQLite file; a failure here
    # (typically "already exists") is deliberately ignored.
    try:
        os.makedirs(app.instance_path)
    except OSError:
        pass

    # Imported here rather than at module level, as in the original layout.
    from . import db

    db.init_app(app)

    from . import feed

    app.register_blueprint(feed.bp)

    return app

11
api/api/app.py Normal file
View file

@ -0,0 +1,11 @@
from flask import Flask
app = Flask(__name__)
@app.route("/")
def hello():
    """Serve a static HTML greeting on the root URL."""
    greeting = "<p>Coucou.</p>"
    return greeting
if __name__ == "__main__":
app.run()

0
api/api/config.py Normal file
View file

38
api/api/db.py Normal file
View file

@ -0,0 +1,38 @@
import sqlite3
import click
from flask import current_app, g
def get_db():
    """Return the SQLite connection stored on ``g``, opening it if needed.

    The connection targets the path in ``current_app.config["DATABASE"]`` and
    yields ``sqlite3.Row`` rows.
    """
    if "db" in g:
        return g.db

    connection = sqlite3.connect(
        current_app.config["DATABASE"],
        detect_types=sqlite3.PARSE_DECLTYPES,
    )
    connection.row_factory = sqlite3.Row
    g.db = connection
    return connection
def close_db(e=None):
    """Close and forget the connection stored on ``g``, if there is one.

    Args:
        e: Unused; accepted so the function can serve as a teardown callback.
    """
    connection = g.pop("db", None)
    if connection is None:
        return
    connection.close()
def init_db():
    """Run the bundled ``schema.sql`` script against the database."""
    connection = get_db()
    with current_app.open_resource("schema.sql") as schema_file:
        script = schema_file.read().decode("utf8")
    connection.executescript(script)
# CLI entry point ("init-db"): runs init_db() and reports completion.
# Documented with a comment rather than a docstring so the command's help
# text stays unchanged.
@click.command("init-db")
def init_db_command():
    init_db()
    message = "Initialized the database."
    click.echo(message)
def init_app(app):
    """Hook the database helpers into ``app``.

    Registers the ``init-db`` CLI command and makes ``close_db`` run when the
    application context is torn down.
    """
    app.cli.add_command(init_db_command)
    app.teardown_appcontext(close_db)

19
api/api/feed.py Normal file
View file

@ -0,0 +1,19 @@
from datetime import datetime
from flask import Blueprint, make_response, render_template
from api.db import get_db
from .scraper import scrape
bp = Blueprint("feed", __name__, url_prefix="/feed")
@bp.route("/", methods=("GET",))
def parse_page():
    """Scrape the hard-coded Ouest-France Rennes page and serve it as RSS.

    Returns:
        A Flask response whose body is the rendered ``rss.xml`` template and
        whose ``Content-Type`` header is ``application/rss+xml``.
    """
    # Local import: the module header only imports ``datetime`` itself.
    from datetime import timezone

    link = "https://www.ouest-france.fr/bretagne/rennes-35000/"
    feed = scrape(link)

    # The template prints this value followed by a literal "+0000", so it has
    # to be expressed in UTC rather than in the server's local time.
    build_date = datetime.now(timezone.utc)
    rss_xml = render_template("rss.xml", feed=feed, build_date=build_date)

    response = make_response(rss_xml)
    response.headers["Content-Type"] = "application/rss+xml"
    return response

6
api/api/schema.sql Normal file
View file

@ -0,0 +1,6 @@
DROP TABLE IF EXISTS feed;
CREATE TABLE feed (
id INTEGER PRIMARY KEY AUTOINCREMENT,
url TEXT NOT NULL
);

63
api/api/scraper.py Normal file
View file

@ -0,0 +1,63 @@
from datetime import datetime
import botasaurus as bt
import dateparser
class FeedItem:
    """A single entry of a generated feed.

    Attributes are stored as given, except ``item_datetime`` which is kept as
    the ISO 8601 string produced by ``datetime.isoformat()``, or ``None`` when
    the item has no timestamp.
    """

    # Sentinel for "argument omitted", so that an explicit ``None`` can mean
    # "this item has no timestamp" instead of "use the current time".
    _NOW = object()

    def __init__(self, title, content, author, link, item_datetime=_NOW):
        """Create a feed item.

        Args:
            title: Item title (may be ``None``).
            content: Item description (may be ``None``).
            author: Item author.
            link: Item URL (may be ``None``).
            item_datetime: ``datetime`` of the item, or ``None`` if unknown.
                When omitted, the current time at call time is used.
        """
        # Resolve the default per call; a ``datetime.now()`` default in the
        # signature would be evaluated only once, when the class is defined.
        if item_datetime is self._NOW:
            item_datetime = datetime.now()

        self.title = title
        self.content = content
        self.author = author
        self.link = link
        self.item_datetime = (
            item_datetime.isoformat() if item_datetime is not None else None
        )

    def __lt__(self, other):
        """Order by timestamp when both items have one, else by title."""
        if self.item_datetime and other.item_datetime:
            return self.item_datetime < other.item_datetime
        # Titles may be None; compare them as empty strings instead of
        # raising TypeError.
        return (self.title or "") < (other.title or "")
class Feed:
    """Plain container holding a feed's title, source URL and items."""

    def __init__(self, title, url, items):
        """Store ``title``, ``url`` and ``items`` unchanged on the instance."""
        self.title, self.url, self.items = title, url, items
@bt.request(output=None)
def scrape(request, link):
    """Scrape an Ouest-France listing page into a ``Feed``.

    Args:
        request: Request helper supplied by the ``bt.request`` decorator;
            only its ``bs4`` method is used here.
        link: URL of the listing page to scrape.

    Returns:
        A ``Feed`` whose items are sorted in descending order. The item list
        is empty when the page has no ``section.liste-articles`` element.
    """
    soup = request.bs4(link)

    # Pages without a <title> fall back to the URL instead of crashing.
    page_title = soup.title.get_text() if soup.title is not None else link
    feed = Feed(title=page_title, url=link, items=[])

    section = soup.find("section", {"class": "liste-articles"})
    if section is None:
        # Unexpected markup: return an empty feed rather than failing on
        # ``None.find_all``.
        return feed

    for article in section.find_all("article", {"class": "teaser-media-liste"}):
        title_tag = article.find("h2")
        content_tag = article.find("p")
        link_tag = article.find("a", {"class": "titre-lien"})
        time_tag = article.find("time")

        item_kwargs = {
            "title": title_tag.get_text() if title_tag is not None else None,
            "content": (
                content_tag.get_text() if content_tag is not None else None
            ),
            "author": "Ouest-France",
            # ``.get`` tolerates an anchor without an href attribute.
            "link": link_tag.get("href") if link_tag is not None else None,
        }

        raw_datetime = time_tag.get("datetime") if time_tag is not None else None
        published = dateparser.parse(raw_datetime) if raw_datetime else None
        # Only forward a timestamp when one was actually parsed; otherwise
        # let FeedItem apply its own default.
        if published is not None:
            item_kwargs["item_datetime"] = published

        feed.items.append(FeedItem(**item_kwargs))

    feed.items.sort(reverse=True)
    return feed

27
api/api/templates/rss.xml Normal file
View file

@ -0,0 +1,27 @@
<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
  <channel>
    {# Expects `feed` (title, url, items) and `build_date` (a datetime) in the context. #}
    <title>{{ feed.title }}</title>
    <atom:link href="{{ request.base_url }}" rel="self" type="application/rss+xml"/>
    <link>{{ request.base_url }}</link>
    <description>A feed generated from {{feed.url}} with Rudibridge</description>
    {# %H:%M:%S is spelled out because %T is not a documented Python strftime directive. #}
    <lastBuildDate>{{ build_date.strftime("%a, %d %b %Y %H:%M:%S") }} +0000</lastBuildDate>
    {% for item in feed.items %}
    <item>
      {% if item.title %}
      <title>{{ item.title }}</title>
      {% endif %}
      {% if item.link %}
      <link>{{ item.link }}</link>
      <guid>{{ item.link }}</guid>
      {% endif %}
      {% if item.content %}
      <description>{{ item.content }}</description>
      {% endif %}
      {% if item.item_datetime %}
      <pubDate>{{ item.item_datetime }}</pubDate>
      {% endif %}
    </item>
    {% endfor %}
  </channel>
</rss>

1
api/local_storage.json Normal file
View file

@ -0,0 +1 @@
{}

1186
api/poetry.lock generated Normal file

File diff suppressed because it is too large Load diff

1
api/profiles.json Normal file
View file

@ -0,0 +1 @@
{}

22
api/pyproject.toml Normal file
View file

@ -0,0 +1,22 @@
[tool.poetry]
name = "api"
version = "0.1.0"
description = "The API behind Rudibridge"
authors = ["Ewen <darempred@korr.bzh>"]
readme = "README.md"
[tool.poetry.dependencies]
python = "^3.11"
flask = "^3.0.3"
gunicorn = "^22.0.0"
flask-sqlalchemy = "^3.1.1"
pytest = "^8.2.0"
coverage = "^7.5.1"
requests = "^2.31.0"
botasaurus = "^4.0.14"
dateparser = "^1.2.0"
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

0
api/tests/__init__.py Normal file
View file

11
docker-compose.yml Normal file
View file

@ -0,0 +1,11 @@
services:
api:
env_file:
- .env.dev
build:
context: ./api
dockerfile: Dockerfile
ports:
- 8080:8080
volumes:
- ./api:/app