From 8e798dde48a7f8bbfa55658f5f18783e13769eb1 Mon Sep 17 00:00:00 2001 From: Aeledfyr <45501007+Aeledfyr@users.noreply.github.com> Date: Fri, 31 Jul 2020 20:18:23 -0500 Subject: [PATCH] feat(search): Faceted search based on mod host (curse/modrinth) (#48) This also adds a command-line argument library (gumdrop) for dealing with indices - resetting, reconfiguring, and skipping them. I don't know which library is best for this case, but gumdrop has shorter compile times and many fewer dependencies than clap, which is why I chose it. --- Cargo.lock | 21 +++++++ Cargo.toml | 1 + src/main.rs | 41 ++++++++++-- src/routes/mod_creation.rs | 4 +- src/scheduler.rs | 4 +- src/search/indexing/curseforge_import.rs | 4 +- src/search/indexing/local_import.rs | 4 +- src/search/indexing/mod.rs | 79 ++++++++++++++++++++++-- src/search/mod.rs | 5 ++ 9 files changed, 147 insertions(+), 16 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9efe6319..cf2acf21 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -903,6 +903,26 @@ version = "0.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bcc8e0c9bce37868955864dbecd2b1ab2bdf967e6f28066d65aaac620444b65c" +[[package]] +name = "gumdrop" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46571f5d540478cf70d2a42dd0d6d8e9f4b9cc7531544b93311e657b86568a0b" +dependencies = [ + "gumdrop_derive", +] + +[[package]] +name = "gumdrop_derive" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "915ef07c710d84733522461de2a734d4d62a3fd39a4d4f404c2f385ef8618d05" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "h2" version = "0.2.5" @@ -1137,6 +1157,7 @@ dependencies = [ "env_logger", "futures", "futures-timer", + "gumdrop", "log", "meilisearch-sdk", "rand", diff --git a/Cargo.toml b/Cargo.toml index 89d51604..947f805c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,6 +24,7 @@ serde = { version = "1.0", 
features = ["derive"] } chrono = { version = "0.4", features = ["serde"] } rand = "0.7" +gumdrop = "0.8" dotenv = "0.15" log = "0.4.8" env_logger = "0.7.1" diff --git a/src/main.rs b/src/main.rs index 5e66c3a0..e6a4086c 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,6 +1,7 @@ use actix_web::middleware::Logger; use actix_web::{web, App, HttpServer}; use env_logger::Env; +use gumdrop::Options; use log::{info, warn}; use search::indexing::index_mods; use search::indexing::IndexingSettings; @@ -13,11 +14,26 @@ mod routes; mod scheduler; mod search; +#[derive(Debug, Options)] +struct Config { + #[options(help = "Print help message")] + help: bool, + + #[options(no_short, help = "Skip indexing on startup")] + skip_first_index: bool, + #[options(no_short, help = "Reset the settings of the indices")] + reconfigure_indices: bool, + #[options(no_short, help = "Reset the documents in the indices")] + reset_indices: bool, +} + #[actix_rt::main] async fn main() -> std::io::Result<()> { dotenv::dotenv().ok(); env_logger::from_env(Env::default().default_filter_or("info")).init(); + let config = Config::parse_args_default_or_exit(); + check_env_vars(); // Database Connector @@ -43,10 +59,17 @@ async fn main() -> std::io::Result<()> { Arc::new(file_hosting::MockHost::new()) }; - // TODO: use a real arg parsing library - let skip_initial = std::env::args().any(|x| x == "skip"); + if config.reset_indices { + info!("Resetting indices"); + search::indexing::reset_indices().await.unwrap(); + } else if config.reconfigure_indices { + info!("Reconfiguring indices"); + search::indexing::reconfigure_indices().await.unwrap(); + } + // Allow manually skipping the initial indexing for quicker iteration // and startup times. 
+ let skip_initial = config.skip_first_index; if skip_initial { info!("Skipping initial indexing"); } @@ -66,9 +89,12 @@ async fn main() -> std::io::Result<()> { let mut skip = skip_initial; scheduler.run(local_index_interval, move || { let pool_ref = pool_ref.clone(); + let local_skip = skip; + if skip { + skip = false; + } async move { - if skip { - skip = false; + if local_skip { return; } info!("Indexing local database"); @@ -90,9 +116,12 @@ async fn main() -> std::io::Result<()> { let mut skip = skip_initial; scheduler.run(std::time::Duration::from_secs(15 * 60), move || { let queue = queue_ref.clone(); + let local_skip = skip; + if skip { + skip = false; + } async move { - if skip { - skip = false; + if local_skip { return; } info!("Indexing created mod queue"); diff --git a/src/routes/mod_creation.rs b/src/routes/mod_creation.rs index 355988ff..1558d40b 100644 --- a/src/routes/mod_creation.rs +++ b/src/routes/mod_creation.rs @@ -11,6 +11,7 @@ use actix_web::{post, HttpResponse}; use futures::stream::StreamExt; use serde::{Deserialize, Serialize}; use sqlx::postgres::PgPool; +use std::borrow::Cow; use std::sync::Arc; use thiserror::Error; @@ -416,7 +417,8 @@ async fn mod_create_inner( // TODO: store and return modified time date_modified: formatted, modified_timestamp: timestamp, - empty: std::borrow::Cow::Borrowed("{}{}{}"), + host: Cow::Borrowed("modrinth"), + empty: Cow::Borrowed("{}{}{}"), }; indexing_queue.add(index_mod); diff --git a/src/scheduler.rs b/src/scheduler.rs index 8f6746d4..5714eadf 100644 --- a/src/scheduler.rs +++ b/src/scheduler.rs @@ -13,9 +13,9 @@ impl Scheduler { } } - pub fn run(&mut self, interval: std::time::Duration, task: F) + pub fn run(&mut self, interval: std::time::Duration, mut task: F) where - F: Fn() -> R + Send + 'static, + F: FnMut() -> R + Send + 'static, R: std::future::Future + Send + 'static, { let future = time::interval(interval).for_each_concurrent(2, move |_| task()); diff --git 
a/src/search/indexing/curseforge_import.rs b/src/search/indexing/curseforge_import.rs index 3ae5f73f..895df0d0 100644 --- a/src/search/indexing/curseforge_import.rs +++ b/src/search/indexing/curseforge_import.rs @@ -2,6 +2,7 @@ use super::IndexingError; use crate::search::UploadSearchMod; use log::info; use serde::{Deserialize, Serialize}; +use std::borrow::Cow; #[derive(Serialize, Deserialize, Debug)] #[serde(rename_all = "camelCase")] @@ -200,7 +201,8 @@ pub async fn index_curseforge( date_modified: modified.to_string(), modified_timestamp: modified.timestamp(), latest_version, - empty: std::borrow::Cow::Borrowed("{}{}{}"), + host: Cow::Borrowed("curseforge"), + empty: Cow::Borrowed("{}{}{}"), }) } diff --git a/src/search/indexing/local_import.rs b/src/search/indexing/local_import.rs index 7a36d618..b6155a7a 100644 --- a/src/search/indexing/local_import.rs +++ b/src/search/indexing/local_import.rs @@ -4,6 +4,7 @@ use log::info; use super::IndexingError; use crate::search::UploadSearchMod; use sqlx::postgres::PgPool; +use std::borrow::Cow; pub async fn index_local(pool: PgPool) -> Result, IndexingError> { info!("Indexing local mods!"); @@ -71,7 +72,8 @@ pub async fn index_local(pool: PgPool) -> Result, IndexingE date_modified: formatted, modified_timestamp: timestamp, latest_version: "".to_string(), // TODO: Info about latest version - empty: std::borrow::Cow::Borrowed("{}{}{}"), + host: Cow::Borrowed("modrinth"), + empty: Cow::Borrowed("{}{}{}"), }); } } diff --git a/src/search/indexing/mod.rs b/src/search/indexing/mod.rs index b4cf4636..322fbb45 100644 --- a/src/search/indexing/mod.rs +++ b/src/search/indexing/mod.rs @@ -72,6 +72,76 @@ pub async fn index_mods(pool: PgPool, settings: IndexingSettings) -> Result<(), Ok(()) } +pub async fn reset_indices() -> Result<(), IndexingError> { + let address = &*dotenv::var("MEILISEARCH_ADDR")?; + let client = Client::new(address, ""); + + client.delete_index("relevance_mods").await?; + 
client.delete_index("downloads_mods").await?; + client.delete_index("updated_mods").await?; + client.delete_index("newest_mods").await?; + Ok(()) +} + +pub async fn reconfigure_indices() -> Result<(), IndexingError> { + let address = &*dotenv::var("MEILISEARCH_ADDR")?; + let client = Client::new(address, ""); + + // Relevance Index + update_index(&client, "relevance_mods", { + let mut relevance_rules = default_rules(); + relevance_rules.push_back("desc(downloads)".to_string()); + relevance_rules.into() + }) + .await?; + + // Downloads Index + update_index(&client, "downloads_mods", { + let mut downloads_rules = default_rules(); + downloads_rules.push_front("desc(downloads)".to_string()); + downloads_rules.into() + }) + .await?; + + // Updated Index + update_index(&client, "updated_mods", { + let mut updated_rules = default_rules(); + updated_rules.push_front("desc(modified_timestamp)".to_string()); + updated_rules.into() + }) + .await?; + + // Created Index + update_index(&client, "newest_mods", { + let mut newest_rules = default_rules(); + newest_rules.push_front("desc(created_timestamp)".to_string()); + newest_rules.into() + }) + .await?; + + Ok(()) +} + +async fn update_index<'a>( + client: &'a Client<'a>, + name: &'a str, + rules: Vec, +) -> Result, IndexingError> { + let index = match client.get_index(name).await { + Ok(index) => index, + Err(meilisearch_sdk::errors::Error::IndexNotFound) => { + client.create_index(name, Some("mod_id")).await? 
+ } + Err(e) => { + return Err(IndexingError::IndexDBError(e)); + } + }; + index + .set_settings(&default_settings().with_ranking_rules(rules)) + .await?; + Ok(index) +} + async fn create_index<'a>( client: &'a Client<'a>, name: &'a str, @@ -129,7 +199,7 @@ pub async fn add_mods(mods: Vec) -> Result<(), IndexingError> { // Updated Index let updated_index = create_index(&client, "updated_mods", || { let mut updated_rules = default_rules(); - updated_rules.push_front("desc(updated)".to_string()); + updated_rules.push_front("desc(modified_timestamp)".to_string()); updated_rules.into() }) .await?; @@ -138,7 +208,7 @@ pub async fn add_mods(mods: Vec) -> Result<(), IndexingError> { // Created Index let newest_index = create_index(&client, "newest_mods", || { let mut newest_rules = default_rules(); - newest_rules.push_front("desc(created)".to_string()); + newest_rules.push_front("desc(created_timestamp)".to_string()); newest_rules.into() }) .await?; @@ -173,10 +243,9 @@ fn default_settings() -> Settings { "icon_url".to_string(), "author_url".to_string(), "date_created".to_string(), - "created".to_string(), "date_modified".to_string(), - "updated".to_string(), "latest_version".to_string(), + "host".to_string(), ]; let searchable_attributes = vec![ @@ -194,7 +263,7 @@ fn default_settings() -> Settings { .with_accept_new_fields(true) .with_stop_words(vec![]) .with_synonyms(HashMap::new()) - .with_attributes_for_faceting(vec![String::from("categories")]) + .with_attributes_for_faceting(vec![String::from("categories"), String::from("host")]) } //endregion diff --git a/src/search/mod.rs b/src/search/mod.rs index e2be3cfd..99796921 100644 --- a/src/search/mod.rs +++ b/src/search/mod.rs @@ -71,6 +71,8 @@ pub struct UploadSearchMod { /// Unix timestamp of the last major modification pub modified_timestamp: i64, + pub host: Cow<'static, str>, + /// Must be "{}{}{}", a hack until meilisearch supports searches /// with empty queries 
(https://github.com/meilisearch/MeiliSearch/issues/729) // This is a Cow to prevent unnecessary allocations for a static @@ -96,6 +98,9 @@ pub struct ResultSearchMod { /// RFC 3339 formatted modification date of the mod pub date_modified: String, pub latest_version: String, + + /// The host of the mod: Either `modrinth` or `curseforge` + pub host: String, } impl Document for UploadSearchMod {