tweak the fetching and posting code

Joe 2025-12-14 12:55:02 -08:00
parent 4434a31c09
commit b39612e969
5 changed files with 278 additions and 139 deletions

Cargo.lock (generated)
View file

@@ -89,6 +89,28 @@ dependencies = [
"windows-sys 0.61.2",
]
+[[package]]
+name = "async-channel"
+version = "1.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "81953c529336010edd6d8e358f886d9581267795c61b19475b71314bffa46d35"
+dependencies = [
+"concurrent-queue",
+"event-listener 2.5.3",
+"futures-core",
+]
+[[package]]
+name = "async-trait"
+version = "0.1.89"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb"
+dependencies = [
+"proc-macro2",
+"quote",
+"syn 2.0.111",
+]
[[package]]
name = "atoi"
version = "2.0.0"
@@ -209,6 +231,7 @@ dependencies = [
"feed-rs",
"html2md",
"justerror",
+"moro",
"rand 0.9.2",
"reqwest",
"serde",
@@ -535,6 +558,12 @@ dependencies = [
"windows-sys 0.48.0",
]
+[[package]]
+name = "event-listener"
+version = "2.5.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0"
[[package]]
name = "event-listener"
version = "5.4.1"
@@ -633,6 +662,21 @@ dependencies = [
"new_debug_unreachable",
]
+[[package]]
+name = "futures"
+version = "0.3.31"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876"
+dependencies = [
+"futures-channel",
+"futures-core",
+"futures-executor",
+"futures-io",
+"futures-sink",
+"futures-task",
+"futures-util",
+]
[[package]]
name = "futures-channel"
version = "0.3.31"
@@ -677,6 +721,17 @@ version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6"
+[[package]]
+name = "futures-macro"
+version = "0.3.31"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650"
+dependencies = [
+"proc-macro2",
+"quote",
+"syn 2.0.111",
+]
[[package]]
name = "futures-sink"
version = "0.3.31"
@@ -695,8 +750,10 @@ version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81"
dependencies = [
+"futures-channel",
"futures-core",
"futures-io",
+"futures-macro",
"futures-sink",
"futures-task",
"memchr",
@@ -1379,6 +1436,17 @@ dependencies = [
"windows-sys 0.61.2",
]
+[[package]]
+name = "moro"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8472c674b8319e7529bfdb3c51216810e36727be2056136d07130a0b1c132df6"
+dependencies = [
+"async-channel",
+"async-trait",
+"futures",
+]
[[package]]
name = "native-tls"
version = "0.2.14"
@@ -2179,7 +2247,7 @@ dependencies = [
"crc",
"crossbeam-queue",
"either",
-"event-listener",
+"event-listener 5.4.1",
"futures-core",
"futures-intrusive",
"futures-io",

View file

@@ -10,6 +10,7 @@ clap = { version = "4.5.53", features = ["derive"] }
feed-rs = { version = "2.3.1", features = ["sanitize"] }
html2md = "0.2.15"
justerror = "1.1.0"
+moro = "0.4.0"
reqwest = "0.12.24"
serde = { version = "1.0.228", features = ["derive"] }
serde_urlencoded = "0.7.1"
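
Cargo.toml picks up moro 0.4.0, the scoped structured-concurrency crate that the new run loop at the bottom of this commit drives via moro::async_scope!. A minimal sketch of the pattern under the moro 0.4 API; fetch_all and its logging body are hypothetical stand-ins, not part of the commit:

// A minimal sketch of moro's scoped-task pattern (API shape assumed from
// moro 0.4); the URL list and the logging body are made up.
async fn fetch_all(urls: &[String]) -> usize {
    moro::async_scope!(|scope| {
        for url in urls {
            // spawned tasks may borrow `urls` from the enclosing scope;
            // the scope future only resolves once every task has finished
            scope.spawn(async move {
                tracing::debug!("would fetch {url}");
            });
        }
        urls.len() // the block's final value becomes the scope's result
    })
    .await
}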

View file

@@ -1 +1 @@
-DROP TABLE IF EXISTS runs;
+DROP TABLE IF EXISTS successful_runs;
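
The down migration now targets successful_runs instead of runs. A hedged sketch of the read side of that table, assuming the schema implied by the queries in this commit (id, feed, date_time); last_success is a hypothetical helper, not part of the commit:

use sqlx::types::chrono::{DateTime, NaiveDateTime, Utc};

// Hypothetical helper mirroring the query check_feed runs below; assumes
// the successful_runs columns implied by the diff: id, feed, date_time.
async fn last_success(db: &sqlx::SqlitePool, feed_id: i64) -> Option<DateTime<Utc>> {
    sqlx::query_scalar::<_, NaiveDateTime>(
        "select date_time from successful_runs where feed = ? order by id desc limit 1",
    )
    .bind(feed_id)
    .fetch_optional(db)
    .await
    .ok() // for brevity, treat db errors as "no previous run"
    .flatten()
    .map(|naive| naive.and_utc())
}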

View file

@@ -7,7 +7,7 @@ use sqlx::{
sqlite::{SqliteConnectOptions, SqliteJournalMode, SqlitePoolOptions},
types::chrono::{DateTime, Utc},
};
-use tokio::task::{JoinHandle, JoinSet};
+use tokio::task::JoinSet;
use tokio_util::{bytes::Buf, sync::CancellationToken};
use unicode_segmentation::UnicodeSegmentation;
@@ -17,6 +17,9 @@ const MAX_CONNS: u32 = 200;
const MIN_CONNS: u32 = 5;
const TIMEOUT: u64 = 2000; // in milliseconds
+const ZULIP_INTERVAL: Duration = Duration::from_millis(250);
+const ZULIP_MESSAGE_CUTOFF: usize = 700;
const LAST_FETCHED: DateTime<Utc> = DateTime::from_timestamp_nanos(0);
const ONE_YEAR: Duration = Duration::from_secs(365 * 24 * 60 * 60);
@@ -24,11 +27,16 @@ pub struct BlogdorTheAggregator {
db: SqlitePool,
client: reqwest::Client,
cancel: CancellationToken,
+endpoint: String,
+channel_id: u32,
+email: String,
+password: String,
}
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct FeedEntry {
-url: String,
+post_url: String,
+feed_url: String,
feed_id: i64,
title: String,
published: DateTime<Utc>,
@@ -37,7 +45,14 @@ pub struct FeedEntry {
body: Option<String>,
}
-#[derive(Debug, Default, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+#[derive(Debug, Default, Clone, PartialEq, Eq)]
+pub struct FeedResult {
+pub entries: Option<Vec<FeedEntry>>,
+pub url: String,
+pub feed_id: i64,
+}
+#[derive(Debug, Default, Clone, PartialEq, Eq, serde::Serialize)]
struct ZulipMessage<'s> {
to: u32,
#[serde(rename = "type")]
@@ -52,32 +67,123 @@ impl BlogdorTheAggregator {
let db = get_db_pool().await;
let client = reqwest::Client::new(); // TODO: retries?
let cancel = CancellationToken::new();
+let endpoint = std::env::var("ZULIP_URL").expect("ZULIP_URL must be set");
+let channel_id: u32 = std::env::var("ZULIP_CHANNEL")
+.expect("ZULIP_CHANNEL must be set")
+.parse()
+.expect("ZULIP_CHANNEL must be an integer");
-Self { db, client, cancel }
+let email = std::env::var("BLOGDOR_EMAIL").expect("BLOGDOR_EMAIL must be set");
+let password = std::env::var("ZULIP_TOKEN").expect("ZULIP_TOKEN must be set");
+Self {
+db,
+client,
+cancel,
+endpoint,
+channel_id,
+email,
+password,
+}
}
-pub async fn aggregate(&self) -> JoinHandle<()> {
-let db = self.db.clone();
-let client = self.client.clone();
-let cancel = self.cancel.clone();
-tokio::task::spawn(async move {
-let mut alarm = tokio::time::interval(Duration::from_hours(1));
-loop {
-tokio::select! {
-_ = alarm.tick() => {
-check_feeds(&db, &client).await;
-}
-_ = cancel.cancelled() => {
-tracing::info!("shutting down the aggregation loop");
-break;
+pub async fn cancelled(&self) {
+self.cancel.cancelled().await
+}
+pub async fn spawn_http(&self) {
+server::spawn_server(self.db.clone(), self.cancel.clone()).await;
+}
+pub async fn check_feeds(&self) -> Result<Vec<Result<FeedResult, String>>, String> {
+tracing::debug!("checking feeds");
+let feeds = sqlx::query!("select id, url from feeds where active = true")
+.fetch_all(&self.db)
+.await
+.map_err(|e| format!("{e}"))?;
+let mut handles = JoinSet::new();
+for feed in feeds {
+handles.spawn(check_feed(
+self.db.clone(),
+feed.id,
+self.client.clone(),
+feed.url,
+));
+}
+let mut feed_results = Vec::new();
+while let Some(feed_result) = handles.join_next().await {
+let Ok(feed_result) = feed_result else {
+let e = feed_result.unwrap_err();
+tracing::error!("got join error: {e}");
+continue;
+};
+feed_results.push(feed_result);
+}
+Ok(feed_results)
+}
+pub async fn post_entries(&self, posts: &[FeedEntry]) {
+let FeedEntry {
+feed_id, received, ..
+} = posts.last().unwrap();
+let mut success = true;
+for post in posts.iter() {
+let body = post
+.body
+.iter()
+.next()
+.cloned()
+.unwrap_or("Blogdor Says: NO BODY!".to_string());
+let content = format!(
+"{body} ...\n\n---\noriginally posted to {}, on {}",
+post.post_url, post.published
+);
+let msg = ZulipMessage {
+to: self.channel_id,
+typ: "stream",
+content,
+topic: Some(&post.title),
+};
+let msg = serde_urlencoded::to_string(msg).expect("serialize msg");
+match self
+.client
+.post(&self.endpoint)
+.basic_auth(&self.email, Some(&self.password))
+.body(msg)
+.header("Content-Type", "application/x-www-form-urlencoded")
+.send()
+.await
+{
+Err(e) => {
+tracing::error!("got error sending to zulip: {e}");
+success = false;
+}
+Ok(r) => {
+if r.status() == StatusCode::OK {
+success &= true;
+} else {
+tracing::warn!("did not successfully post to zulip: status {}", r.status());
+success = false;
+}
+}
+}
-})
-}
-pub async fn listen_http(&self) -> JoinHandle<()> {
-server::spawn_server(self.db.clone(), self.cancel.clone()).await
+tokio::time::sleep(ZULIP_INTERVAL).await;
}
+if success
+&& let Err(e) = sqlx::query!(
+"insert into successful_runs (feed, date_time) values (?, ?)",
+feed_id,
+received
+)
+.execute(&self.db)
+.await
+{
+tracing::error!("could not insert run for {feed_id}, got {e}");
+}
}
pub async fn close_db(&self) {
@@ -85,106 +191,14 @@ impl BlogdorTheAggregator {
}
}
-async fn check_feeds(db: &SqlitePool, client: &reqwest::Client) {
-tracing::debug!("checking feeds");
-let feeds = match sqlx::query!("select id, url from feeds where active = true")
-.fetch_all(db)
-.await
-{
-Ok(feeds) => feeds,
-Err(e) => {
-tracing::error!("got error fetching feeds from db: {e}");
-return;
-}
-};
-let endpoint = std::env::var("ZULIP_URL").expect("ZULIP_URL must be set");
-let channel_id: u32 = std::env::var("ZULIP_CHANNEL")
-.expect("ZULIP_CHANNEL must be set")
-.parse()
-.expect("ZULIP_CHANNEL must be an integer");
-let email = std::env::var("BLOGDOR_EMAIL").expect("BLOGDOR_EMAIL must be set");
-let password = std::env::var("ZULIP_TOKEN").expect("ZULIP_TOKEN must be set");
-let mut handles = JoinSet::new();
-for feed in feeds {
-handles.spawn(check_feed(db.clone(), feed.id, client.clone(), feed.url));
-}
-while let Some(posts) = handles.join_next().await {
-let Ok(posts) = posts else {
-let e = posts.unwrap_err();
-tracing::error!("got join error: {e}");
-continue;
-};
-match posts {
-Err(s) => {
-tracing::warn!("could not fetch feed: {s}")
-}
-Ok(None) => {}
-Ok(Some(posts)) => {
-let FeedEntry { feed_id, .. } = posts.last().unwrap();
-let mut success = true;
-for post in posts.iter() {
-let body = post
-.body
-.iter()
-.next()
-.cloned()
-.unwrap_or("Blogdor Says: NO BODY!".to_string());
-let content = format!(
-"{body}\n\n---\noriginally posted to {}, on {}",
-post.url, post.published
-);
-let msg = ZulipMessage {
-to: channel_id,
-typ: "stream",
-content,
-topic: Some(&post.title),
-};
-let msg = serde_urlencoded::to_string(msg).expect("serialize msg");
-match client
-.post(&endpoint)
-.basic_auth(&email, Some(&password))
-.body(msg)
-.header("Content-Type", "application/x-www-form-urlencoded")
-.send()
-.await
-{
-Err(e) => {
-tracing::error!("got error sending to zulip: {e}");
-success = false;
-}
-Ok(r) => {
-if r.status() == StatusCode::OK {
-success &= true;
-} else {
-success = false;
-}
-}
-}
-tokio::time::sleep(Duration::from_millis(200)).await;
-}
-if success
-&& let Err(e) =
-sqlx::query!("insert into successful_runs (feed) values (?)", feed_id)
-.execute(db)
-.await
-{
-tracing::error!("could not insert run for {feed_id}, got {e}");
-}
-}
-}
-}
-}
// takes args by value because it's meant to be called from inside a spawned
// tokio task scope
async fn check_feed(
db: SqlitePool,
feed_id: i64,
client: reqwest::Client,
url: String,
-) -> Result<Option<Vec<FeedEntry>>, String> {
+) -> Result<FeedResult, String> {
let rec = sqlx::query!(
"select date_time from successful_runs where feed = ? order by id desc limit 1",
feed_id
@@ -196,7 +210,7 @@ async fn check_feed(
tracing::debug!("checking {url}");
let last_fetched = rec.map(|d| d.date_time.and_utc()).unwrap_or(LAST_FETCHED);
let now = Utc::now();
-let mut out = Vec::new();
+let mut entries = None;
let feed = client
.get(&url)
.send()
@@ -211,13 +225,14 @@ async fn check_feed(
let last_year = now - ONE_YEAR;
if post.published.unwrap_or(last_year) > last_fetched {
let entry = FeedEntry {
-url: post
+post_url: post
.links
.first()
.cloned()
.map(|l| l.href)
-.unwrap_or("".to_string()),
+.unwrap_or("Blogdor Says: NO POST URL".to_string()),
feed_id,
+feed_url: url.clone(),
title: post
.title
.map(|t| t.content)
@@ -229,23 +244,21 @@ async fn check_feed(
c.body.map(|f| {
let s = html2md::parse_html(&f)
.graphemes(false)
-.take(500)
+.take(ZULIP_MESSAGE_CUTOFF)
.collect::<String>();
s.to_string()
})
}),
};
-out.push(entry);
+entries.get_or_insert(Vec::new()).push(entry);
}
}
-if out.is_empty() {
-tracing::debug!("no new items from {url}");
-Ok(None)
-} else {
-tracing::debug!("found {} new items from {url}", out.len());
-Ok(Some(out))
-}
+Ok(FeedResult {
+entries,
+url,
+feed_id,
+})
}
async fn get_db_pool() -> SqlitePool {
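
post_entries above form-encodes each ZulipMessage and POSTs it with basic auth. A self-contained sketch of just the encoding step; the struct mirrors ZulipMessage as far as the diff shows it, and the field values are made up:

// Sketch of the serde_urlencoded step used by post_entries (assumes
// serde's derive feature, which this project already enables).
#[derive(serde::Serialize)]
struct ZulipMessage<'s> {
    to: u32,
    #[serde(rename = "type")] // Zulip's form field is literally "type"
    typ: &'s str,
    content: String,
    topic: Option<&'s str>,
}

fn main() {
    let msg = ZulipMessage {
        to: 42,
        typ: "stream",
        content: "hello there".to_string(),
        topic: Some("greetings"),
    };
    let body = serde_urlencoded::to_string(msg).expect("serialize msg");
    // prints: to=42&type=stream&content=hello+there&topic=greetings
    println!("{body}");
}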

View file

@@ -1,8 +1,24 @@
+use std::time::Duration;
use blogdor::BlogdorTheAggregator;
use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt};
+const BLOGDOR_SNOOZE: Duration = Duration::from_hours(1);
#[tokio::main(flavor = "multi_thread")]
async fn main() {
+init_logs();
+let bta = BlogdorTheAggregator::new().await;
+bta.spawn_http().await;
+run_loop(&bta).await;
+bta.close_db().await;
+tracing::info!("db closed, exiting");
+}
+fn init_logs() {
tracing_subscriber::registry()
.with(
tracing_subscriber::EnvFilter::try_from_default_env()
@@ -10,11 +26,52 @@ async fn main() {
)
.with(tracing_subscriber::fmt::layer())
.init();
-let bta = BlogdorTheAggregator::new().await;
-let aggregator_handle = bta.aggregate().await;
-let server_handle = bta.listen_http().await;
-server_handle.await.unwrap_or_default();
-aggregator_handle.await.unwrap_or_default();
-bta.close_db().await;
}
+async fn run_loop(bta: &BlogdorTheAggregator) {
+let mut alarm = tokio::time::interval(BLOGDOR_SNOOZE);
+moro::async_scope!(|scope| {
+scope.spawn(async {
+loop {
+tokio::select! {
+biased;
+_ = alarm.tick() => {
+match bta.check_feeds().await {
+Ok(results) => {
+for result in results {
+match result {
+Ok(result) => {
+if let Some(ref posts) = result.entries {
+tracing::debug!(
+"got {} new posts from {}",
+posts.len(),
+result.url
+);
+bta.post_entries(posts).await;
+} else {
+tracing::debug!("no new posts from {}", result.url);
+}
+},
+// inner error for singular feed
+Err(e) => {
+tracing::warn!("could not check feed: {e}");
+},
+}
+}
+},
+// outer check_feeds error
+Err(e) => {
+tracing::warn!("could not check feeds: {e}");
+}
+}
+}
+_ = bta.cancelled() => {
+tracing::info!("shutting down the aggregation loop");
+break;
+}
+}
+}
+});
+})
+.await;
+}
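
check_feed above trims the converted markdown at ZULIP_MESSAGE_CUTOFF grapheme clusters rather than bytes: slicing a &str at an arbitrary byte offset can split a multi-byte code point, while grapheme iteration keeps user-visible characters intact. A standalone sketch of the same truncation idea; the sample string is made up, and `true` selects extended grapheme clusters (the diff passes `false`, i.e. legacy clusters):

use unicode_segmentation::UnicodeSegmentation;

// Truncate at user-visible characters, the way check_feed does with
// ZULIP_MESSAGE_CUTOFF, instead of slicing bytes.
fn truncate(text: &str, cutoff: usize) -> String {
    text.graphemes(true).take(cutoff).collect()
}

fn main() {
    let s = "héllo 👋 wörld";
    // 7 graphemes: h, é, l, l, o, space, the emoji; no broken code points
    assert_eq!(truncate(s, 7), "héllo 👋");
}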