Optimizations and treating paragraphs as separate notes

This commit is contained in:
Natty 2024-03-29 14:43:54 +01:00
parent e4e8908788
commit 4723385751
Signed by: natty
GPG Key ID: BF6CB659ADEE60EC
1 changed files with 17 additions and 4 deletions

View File

@ -5,9 +5,9 @@ use futures::TryStreamExt;
use indicatif::ProgressBar;
use rand::distributions::{Distribution, WeightedIndex};
use regex::Regex;
use reqwest::{header, ClientBuilder, RequestBuilder, Response};
use reqwest::{header, ClientBuilder};
use serde::{Deserialize, Serialize};
use serde_json::{json, Value};
use serde_json::Value;
use sprs::{CsMat, TriMat};
use sqlx::postgres::PgPoolOptions;
@ -154,6 +154,14 @@ impl State {
.replace_all(input, "")
.to_string();
if input.contains("\n\n") {
let paragraph_split = Regex::new("\n\n+").unwrap();
paragraph_split
.split(&input)
.for_each(|p| self.insert_tokens(p));
return;
}
let regex = Regex::new(r"\s+").unwrap();
let nasty_words = vec![
@ -201,7 +209,6 @@ async fn main() {
} else {
State::new()
};
progress.disable_steady_tick();
let pool = PgPoolOptions::new()
.connect(std::env::var("DATABASE_URL").as_deref().unwrap())
@ -217,7 +224,8 @@ async fn main() {
r#"SELECT text, "createdAt"
FROM note
WHERE "note"."userId" = $1
AND "note"."createdAt" > $2
AND "note"."createdAt" > $2
AND "note"."createdAt" < NOW()
AND "note"."visibility" IN ('public', 'home')
AND ("note"."cw" IS NULL OR LOWER("note"."cw") IN ('', 'gay', 'cursed', 'what', 'shitpost', 'no', 'natty what', 'natty what the fuck'))"#,
"9awy7u3l76",
@ -239,6 +247,11 @@ async fn main() {
progress.set_length(cnt);
}
drop(stream);
drop(pool);
println!("Shape: {:?}", state.matrix.shape());
let file = File::create(path).unwrap();
progress.set_message("Saving data...");
rmp_serde::encode::write(&mut progress.wrap_write(file), &state).unwrap();