tweak the fetching and posting code
This commit is contained in:
parent
4434a31c09
commit
b39612e969
5 changed files with 278 additions and 139 deletions
70
Cargo.lock
generated
70
Cargo.lock
generated
|
|
@ -89,6 +89,28 @@ dependencies = [
|
|||
"windows-sys 0.61.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "async-channel"
|
||||
version = "1.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "81953c529336010edd6d8e358f886d9581267795c61b19475b71314bffa46d35"
|
||||
dependencies = [
|
||||
"concurrent-queue",
|
||||
"event-listener 2.5.3",
|
||||
"futures-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "async-trait"
|
||||
version = "0.1.89"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.111",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "atoi"
|
||||
version = "2.0.0"
|
||||
|
|
@ -209,6 +231,7 @@ dependencies = [
|
|||
"feed-rs",
|
||||
"html2md",
|
||||
"justerror",
|
||||
"moro",
|
||||
"rand 0.9.2",
|
||||
"reqwest",
|
||||
"serde",
|
||||
|
|
@ -535,6 +558,12 @@ dependencies = [
|
|||
"windows-sys 0.48.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "event-listener"
|
||||
version = "2.5.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0"
|
||||
|
||||
[[package]]
|
||||
name = "event-listener"
|
||||
version = "5.4.1"
|
||||
|
|
@ -633,6 +662,21 @@ dependencies = [
|
|||
"new_debug_unreachable",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "futures"
|
||||
version = "0.3.31"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876"
|
||||
dependencies = [
|
||||
"futures-channel",
|
||||
"futures-core",
|
||||
"futures-executor",
|
||||
"futures-io",
|
||||
"futures-sink",
|
||||
"futures-task",
|
||||
"futures-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "futures-channel"
|
||||
version = "0.3.31"
|
||||
|
|
@ -677,6 +721,17 @@ version = "0.3.31"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6"
|
||||
|
||||
[[package]]
|
||||
name = "futures-macro"
|
||||
version = "0.3.31"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.111",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "futures-sink"
|
||||
version = "0.3.31"
|
||||
|
|
@ -695,8 +750,10 @@ version = "0.3.31"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81"
|
||||
dependencies = [
|
||||
"futures-channel",
|
||||
"futures-core",
|
||||
"futures-io",
|
||||
"futures-macro",
|
||||
"futures-sink",
|
||||
"futures-task",
|
||||
"memchr",
|
||||
|
|
@ -1379,6 +1436,17 @@ dependencies = [
|
|||
"windows-sys 0.61.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "moro"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8472c674b8319e7529bfdb3c51216810e36727be2056136d07130a0b1c132df6"
|
||||
dependencies = [
|
||||
"async-channel",
|
||||
"async-trait",
|
||||
"futures",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "native-tls"
|
||||
version = "0.2.14"
|
||||
|
|
@ -2179,7 +2247,7 @@ dependencies = [
|
|||
"crc",
|
||||
"crossbeam-queue",
|
||||
"either",
|
||||
"event-listener",
|
||||
"event-listener 5.4.1",
|
||||
"futures-core",
|
||||
"futures-intrusive",
|
||||
"futures-io",
|
||||
|
|
|
|||
|
|
@ -10,6 +10,7 @@ clap = { version = "4.5.53", features = ["derive"] }
|
|||
feed-rs = { version = "2.3.1", features = ["sanitize"] }
|
||||
html2md = "0.2.15"
|
||||
justerror = "1.1.0"
|
||||
moro = "0.4.0"
|
||||
reqwest = "0.12.24"
|
||||
serde = { version = "1.0.228", features = ["derive"] }
|
||||
serde_urlencoded = "0.7.1"
|
||||
|
|
|
|||
|
|
@ -1 +1 @@
|
|||
DROP TABLE IF EXISTS runs;
|
||||
DROP TABLE IF EXISTS successful_runs;
|
||||
|
|
|
|||
273
src/lib.rs
273
src/lib.rs
|
|
@ -7,7 +7,7 @@ use sqlx::{
|
|||
sqlite::{SqliteConnectOptions, SqliteJournalMode, SqlitePoolOptions},
|
||||
types::chrono::{DateTime, Utc},
|
||||
};
|
||||
use tokio::task::{JoinHandle, JoinSet};
|
||||
use tokio::task::JoinSet;
|
||||
use tokio_util::{bytes::Buf, sync::CancellationToken};
|
||||
use unicode_segmentation::UnicodeSegmentation;
|
||||
|
||||
|
|
@ -17,6 +17,9 @@ const MAX_CONNS: u32 = 200;
|
|||
const MIN_CONNS: u32 = 5;
|
||||
const TIMEOUT: u64 = 2000; // in milliseconds
|
||||
|
||||
const ZULIP_INTERVAL: Duration = Duration::from_millis(250);
|
||||
const ZULIP_MESSAGE_CUTOFF: usize = 700;
|
||||
|
||||
const LAST_FETCHED: DateTime<Utc> = DateTime::from_timestamp_nanos(0);
|
||||
const ONE_YEAR: Duration = Duration::from_secs(365 * 24 * 60 * 60);
|
||||
|
||||
|
|
@ -24,11 +27,16 @@ pub struct BlogdorTheAggregator {
|
|||
db: SqlitePool,
|
||||
client: reqwest::Client,
|
||||
cancel: CancellationToken,
|
||||
endpoint: String,
|
||||
channel_id: u32,
|
||||
email: String,
|
||||
password: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Default, Clone, PartialEq, Eq)]
|
||||
pub struct FeedEntry {
|
||||
url: String,
|
||||
post_url: String,
|
||||
feed_url: String,
|
||||
feed_id: i64,
|
||||
title: String,
|
||||
published: DateTime<Utc>,
|
||||
|
|
@ -37,7 +45,14 @@ pub struct FeedEntry {
|
|||
body: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Default, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
#[derive(Debug, Default, Clone, PartialEq, Eq)]
|
||||
pub struct FeedResult {
|
||||
pub entries: Option<Vec<FeedEntry>>,
|
||||
pub url: String,
|
||||
pub feed_id: i64,
|
||||
}
|
||||
|
||||
#[derive(Debug, Default, Clone, PartialEq, Eq, serde::Serialize)]
|
||||
struct ZulipMessage<'s> {
|
||||
to: u32,
|
||||
#[serde(rename = "type")]
|
||||
|
|
@ -52,32 +67,123 @@ impl BlogdorTheAggregator {
|
|||
let db = get_db_pool().await;
|
||||
let client = reqwest::Client::new(); // TODO: retries?
|
||||
let cancel = CancellationToken::new();
|
||||
let endpoint = std::env::var("ZULIP_URL").expect("ZULIP_URL must be set");
|
||||
let channel_id: u32 = std::env::var("ZULIP_CHANNEL")
|
||||
.expect("ZULIP_CHANNEL must be set")
|
||||
.parse()
|
||||
.expect("ZULIP_CHANNEL must be an integer");
|
||||
|
||||
Self { db, client, cancel }
|
||||
let email = std::env::var("BLOGDOR_EMAIL").expect("BLOGDOR_EMAIL must be set");
|
||||
let password = std::env::var("ZULIP_TOKEN").expect("ZULIP_TOKEN must be set");
|
||||
|
||||
Self {
|
||||
db,
|
||||
client,
|
||||
cancel,
|
||||
endpoint,
|
||||
channel_id,
|
||||
email,
|
||||
password,
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn aggregate(&self) -> JoinHandle<()> {
|
||||
let db = self.db.clone();
|
||||
let client = self.client.clone();
|
||||
let cancel = self.cancel.clone();
|
||||
tokio::task::spawn(async move {
|
||||
let mut alarm = tokio::time::interval(Duration::from_hours(1));
|
||||
loop {
|
||||
tokio::select! {
|
||||
_ = alarm.tick() => {
|
||||
check_feeds(&db, &client).await;
|
||||
}
|
||||
_ = cancel.cancelled() => {
|
||||
tracing::info!("shutting down the aggregation loop");
|
||||
break;
|
||||
pub async fn cancelled(&self) {
|
||||
self.cancel.cancelled().await
|
||||
}
|
||||
|
||||
pub async fn spawn_http(&self) {
|
||||
server::spawn_server(self.db.clone(), self.cancel.clone()).await;
|
||||
}
|
||||
|
||||
pub async fn check_feeds(&self) -> Result<Vec<Result<FeedResult, String>>, String> {
|
||||
tracing::debug!("checking feeds");
|
||||
let feeds = sqlx::query!("select id, url from feeds where active = true")
|
||||
.fetch_all(&self.db)
|
||||
.await
|
||||
.map_err(|e| format!("{e}"))?;
|
||||
|
||||
let mut handles = JoinSet::new();
|
||||
for feed in feeds {
|
||||
handles.spawn(check_feed(
|
||||
self.db.clone(),
|
||||
feed.id,
|
||||
self.client.clone(),
|
||||
feed.url,
|
||||
));
|
||||
}
|
||||
|
||||
let mut feed_results = Vec::new();
|
||||
while let Some(feed_result) = handles.join_next().await {
|
||||
let Ok(feed_result) = feed_result else {
|
||||
let e = feed_result.unwrap_err();
|
||||
tracing::error!("got join error: {e}");
|
||||
continue;
|
||||
};
|
||||
feed_results.push(feed_result);
|
||||
}
|
||||
|
||||
Ok(feed_results)
|
||||
}
|
||||
|
||||
pub async fn post_entries(&self, posts: &[FeedEntry]) {
|
||||
let FeedEntry {
|
||||
feed_id, received, ..
|
||||
} = posts.last().unwrap();
|
||||
let mut success = true;
|
||||
for post in posts.iter() {
|
||||
let body = post
|
||||
.body
|
||||
.iter()
|
||||
.next()
|
||||
.cloned()
|
||||
.unwrap_or("Blogdor Says: NO BODY!".to_string());
|
||||
let content = format!(
|
||||
"{body} ...\n\n---\noriginally posted to {}, on {}",
|
||||
post.post_url, post.published
|
||||
);
|
||||
let msg = ZulipMessage {
|
||||
to: self.channel_id,
|
||||
typ: "stream",
|
||||
content,
|
||||
topic: Some(&post.title),
|
||||
};
|
||||
let msg = serde_urlencoded::to_string(msg).expect("serialize msg");
|
||||
|
||||
match self
|
||||
.client
|
||||
.post(&self.endpoint)
|
||||
.basic_auth(&self.email, Some(&self.password))
|
||||
.body(msg)
|
||||
.header("Content-Type", "application/x-www-form-urlencoded")
|
||||
.send()
|
||||
.await
|
||||
{
|
||||
Err(e) => {
|
||||
tracing::error!("got error sending to zulip: {e}");
|
||||
success = false;
|
||||
}
|
||||
Ok(r) => {
|
||||
if r.status() == StatusCode::OK {
|
||||
success &= true;
|
||||
} else {
|
||||
tracing::warn!("did not successfully post to zulip: status {}", r.status());
|
||||
success = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn listen_http(&self) -> JoinHandle<()> {
|
||||
server::spawn_server(self.db.clone(), self.cancel.clone()).await
|
||||
tokio::time::sleep(ZULIP_INTERVAL).await;
|
||||
}
|
||||
if success
|
||||
&& let Err(e) = sqlx::query!(
|
||||
"insert into successful_runs (feed, date_time) values (?, ?)",
|
||||
feed_id,
|
||||
received
|
||||
)
|
||||
.execute(&self.db)
|
||||
.await
|
||||
{
|
||||
tracing::error!("could not insert run for {feed_id}, got {e}");
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn close_db(&self) {
|
||||
|
|
@ -85,106 +191,14 @@ impl BlogdorTheAggregator {
|
|||
}
|
||||
}
|
||||
|
||||
async fn check_feeds(db: &SqlitePool, client: &reqwest::Client) {
|
||||
tracing::debug!("checking feeds");
|
||||
let feeds = match sqlx::query!("select id, url from feeds where active = true")
|
||||
.fetch_all(db)
|
||||
.await
|
||||
{
|
||||
Ok(feeds) => feeds,
|
||||
Err(e) => {
|
||||
tracing::error!("got error fetching feeds from db: {e}");
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
let endpoint = std::env::var("ZULIP_URL").expect("ZULIP_URL must be set");
|
||||
let channel_id: u32 = std::env::var("ZULIP_CHANNEL")
|
||||
.expect("ZULIP_CHANNEL must be set")
|
||||
.parse()
|
||||
.expect("ZULIP_CHANNEL must be an integer");
|
||||
|
||||
let email = std::env::var("BLOGDOR_EMAIL").expect("BLOGDOR_EMAIL must be set");
|
||||
let password = std::env::var("ZULIP_TOKEN").expect("ZULIP_TOKEN must be set");
|
||||
|
||||
let mut handles = JoinSet::new();
|
||||
for feed in feeds {
|
||||
handles.spawn(check_feed(db.clone(), feed.id, client.clone(), feed.url));
|
||||
}
|
||||
while let Some(posts) = handles.join_next().await {
|
||||
let Ok(posts) = posts else {
|
||||
let e = posts.unwrap_err();
|
||||
tracing::error!("got join error: {e}");
|
||||
continue;
|
||||
};
|
||||
match posts {
|
||||
Err(s) => {
|
||||
tracing::warn!("could not fetch feed: {s}")
|
||||
}
|
||||
Ok(None) => {}
|
||||
Ok(Some(posts)) => {
|
||||
let FeedEntry { feed_id, .. } = posts.last().unwrap();
|
||||
let mut success = true;
|
||||
for post in posts.iter() {
|
||||
let body = post
|
||||
.body
|
||||
.iter()
|
||||
.next()
|
||||
.cloned()
|
||||
.unwrap_or("Blogdor Says: NO BODY!".to_string());
|
||||
let content = format!(
|
||||
"{body}\n\n---\noriginally posted to {}, on {}",
|
||||
post.url, post.published
|
||||
);
|
||||
let msg = ZulipMessage {
|
||||
to: channel_id,
|
||||
typ: "stream",
|
||||
content,
|
||||
topic: Some(&post.title),
|
||||
};
|
||||
let msg = serde_urlencoded::to_string(msg).expect("serialize msg");
|
||||
|
||||
match client
|
||||
.post(&endpoint)
|
||||
.basic_auth(&email, Some(&password))
|
||||
.body(msg)
|
||||
.header("Content-Type", "application/x-www-form-urlencoded")
|
||||
.send()
|
||||
.await
|
||||
{
|
||||
Err(e) => {
|
||||
tracing::error!("got error sending to zulip: {e}");
|
||||
success = false;
|
||||
}
|
||||
Ok(r) => {
|
||||
if r.status() == StatusCode::OK {
|
||||
success &= true;
|
||||
} else {
|
||||
success = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
tokio::time::sleep(Duration::from_millis(200)).await;
|
||||
}
|
||||
if success
|
||||
&& let Err(e) =
|
||||
sqlx::query!("insert into successful_runs (feed) values (?)", feed_id)
|
||||
.execute(db)
|
||||
.await
|
||||
{
|
||||
tracing::error!("could not insert run for {feed_id}, got {e}");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// takes args by value because it's meant to be called from inside a spawned
|
||||
// tokio task scope
|
||||
async fn check_feed(
|
||||
db: SqlitePool,
|
||||
feed_id: i64,
|
||||
client: reqwest::Client,
|
||||
url: String,
|
||||
) -> Result<Option<Vec<FeedEntry>>, String> {
|
||||
) -> Result<FeedResult, String> {
|
||||
let rec = sqlx::query!(
|
||||
"select date_time from successful_runs where feed = ? order by id desc limit 1",
|
||||
feed_id
|
||||
|
|
@ -196,7 +210,7 @@ async fn check_feed(
|
|||
tracing::debug!("checking {url}");
|
||||
let last_fetched = rec.map(|d| d.date_time.and_utc()).unwrap_or(LAST_FETCHED);
|
||||
let now = Utc::now();
|
||||
let mut out = Vec::new();
|
||||
let mut entries = None;
|
||||
let feed = client
|
||||
.get(&url)
|
||||
.send()
|
||||
|
|
@ -211,13 +225,14 @@ async fn check_feed(
|
|||
let last_year = now - ONE_YEAR;
|
||||
if post.published.unwrap_or(last_year) > last_fetched {
|
||||
let entry = FeedEntry {
|
||||
url: post
|
||||
post_url: post
|
||||
.links
|
||||
.first()
|
||||
.cloned()
|
||||
.map(|l| l.href)
|
||||
.unwrap_or("".to_string()),
|
||||
.unwrap_or("Blogdor Says: NO POST URL".to_string()),
|
||||
feed_id,
|
||||
feed_url: url.clone(),
|
||||
title: post
|
||||
.title
|
||||
.map(|t| t.content)
|
||||
|
|
@ -229,23 +244,21 @@ async fn check_feed(
|
|||
c.body.map(|f| {
|
||||
let s = html2md::parse_html(&f)
|
||||
.graphemes(false)
|
||||
.take(500)
|
||||
.take(ZULIP_MESSAGE_CUTOFF)
|
||||
.collect::<String>();
|
||||
s.to_string()
|
||||
})
|
||||
}),
|
||||
};
|
||||
out.push(entry);
|
||||
entries.get_or_insert(Vec::new()).push(entry);
|
||||
}
|
||||
}
|
||||
|
||||
if out.is_empty() {
|
||||
tracing::debug!("no new items from {url}");
|
||||
Ok(None)
|
||||
} else {
|
||||
tracing::debug!("found {} new items from {url}", out.len());
|
||||
Ok(Some(out))
|
||||
}
|
||||
Ok(FeedResult {
|
||||
entries,
|
||||
url,
|
||||
feed_id,
|
||||
})
|
||||
}
|
||||
|
||||
async fn get_db_pool() -> SqlitePool {
|
||||
|
|
|
|||
71
src/main.rs
71
src/main.rs
|
|
@ -1,8 +1,24 @@
|
|||
use std::time::Duration;
|
||||
|
||||
use blogdor::BlogdorTheAggregator;
|
||||
use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt};
|
||||
|
||||
const BLOGDOR_SNOOZE: Duration = Duration::from_hours(1);
|
||||
|
||||
#[tokio::main(flavor = "multi_thread")]
|
||||
async fn main() {
|
||||
init_logs();
|
||||
|
||||
let bta = BlogdorTheAggregator::new().await;
|
||||
bta.spawn_http().await;
|
||||
run_loop(&bta).await;
|
||||
|
||||
bta.close_db().await;
|
||||
|
||||
tracing::info!("db closed, exiting");
|
||||
}
|
||||
|
||||
fn init_logs() {
|
||||
tracing_subscriber::registry()
|
||||
.with(
|
||||
tracing_subscriber::EnvFilter::try_from_default_env()
|
||||
|
|
@ -10,11 +26,52 @@ async fn main() {
|
|||
)
|
||||
.with(tracing_subscriber::fmt::layer())
|
||||
.init();
|
||||
|
||||
let bta = BlogdorTheAggregator::new().await;
|
||||
let aggregator_handle = bta.aggregate().await;
|
||||
let server_handle = bta.listen_http().await;
|
||||
server_handle.await.unwrap_or_default();
|
||||
aggregator_handle.await.unwrap_or_default();
|
||||
bta.close_db().await;
|
||||
}
|
||||
|
||||
async fn run_loop(bta: &BlogdorTheAggregator) {
|
||||
let mut alarm = tokio::time::interval(BLOGDOR_SNOOZE);
|
||||
moro::async_scope!(|scope| {
|
||||
scope.spawn(async {
|
||||
loop {
|
||||
tokio::select! {
|
||||
biased;
|
||||
_ = alarm.tick() => {
|
||||
match bta.check_feeds().await {
|
||||
Ok(results) => {
|
||||
for result in results {
|
||||
match result {
|
||||
Ok(result) => {
|
||||
if let Some(ref posts) = result.entries {
|
||||
tracing::debug!(
|
||||
"got {} new posts from {}",
|
||||
posts.len(),
|
||||
result.url
|
||||
);
|
||||
bta.post_entries(posts).await;
|
||||
} else {
|
||||
tracing::debug!("no new posts from {}", result.url);
|
||||
}
|
||||
},
|
||||
// inner error for singular feed
|
||||
Err(e) => {
|
||||
tracing::warn!("could not check feed: {e}");
|
||||
},
|
||||
}
|
||||
}
|
||||
},
|
||||
// outer check_feeds error
|
||||
Err(e) => {
|
||||
tracing::warn!("could not check feeds: {e}");
|
||||
}
|
||||
}
|
||||
}
|
||||
_ = bta.cancelled() => {
|
||||
tracing::info!("shutting down the aggregation loop");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
})
|
||||
.await;
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in a new issue