/// This module is used for indexing from any source.
pub mod local_import;

use itertools::Itertools;
use std::collections::HashMap;

use crate::database::redis::RedisPool;
use crate::models::ids::base62_impl::to_base62;
use crate::search::{SearchConfig, UploadSearchProject};
use local_import::index_local;
use log::info;
use meilisearch_sdk::client::Client;
use meilisearch_sdk::indexes::Index;
use meilisearch_sdk::settings::{PaginationSetting, Settings};
use sqlx::postgres::PgPool;
use thiserror::Error;

use self::local_import::get_all_ids;

#[derive(Error, Debug)]
pub enum IndexingError {
    #[error("Error while connecting to the MeiliSearch database")]
    Indexing(#[from] meilisearch_sdk::errors::Error),
    #[error("Error while serializing or deserializing JSON: {0}")]
    Serde(#[from] serde_json::Error),
    #[error("Database Error: {0}")]
    Sqlx(#[from] sqlx::error::Error),
    #[error("Database Error: {0}")]
    Database(#[from] crate::database::models::DatabaseError),
    #[error("Environment Error")]
    Env(#[from] dotenvy::Error),
    #[error("Error while awaiting index creation task")]
    Task,
}

// The chunk size for adding projects to the indexing database. If the request size
// is too large (>10MiB) then the request fails with an error. This chunk size
// assumes a max average size of 4KiB per project to avoid this cap.
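// (2500 documents x 4 KiB ≈ 9.8 MiB, which stays just under the 10 MiB limit.)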
const MEILISEARCH_CHUNK_SIZE: usize = 2500; // Should be less than FETCH_PROJECT_SIZE
const FETCH_PROJECT_SIZE: usize = 5000;

const TIMEOUT: std::time::Duration = std::time::Duration::from_secs(60);
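
/// Removes the documents for the given version ids (keyed by their base62 ids)
/// from every search index.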
pub async fn remove_documents(
    ids: &[crate::models::ids::VersionId],
    config: &SearchConfig,
) -> Result<(), meilisearch_sdk::errors::Error> {
    let indexes = get_indexes(config).await?;

    for index in indexes {
        index
            .delete_documents(&ids.iter().map(|x| to_base62(x.0)).collect::<Vec<_>>())
            .await?;
    }

    Ok(())
}
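
/// Fetches every version id from the database in `FETCH_PROJECT_SIZE` chunks
/// and uploads the resulting documents to every search index.
///
/// A minimal sketch of a hypothetical caller (the `pool`, `redis`, and
/// `search_config` names are illustrative, not defined in this module):
///
/// ```ignore
/// index_projects(pool.clone(), redis.clone(), &search_config).await?;
/// ```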
pub async fn index_projects(
    pool: PgPool,
    redis: RedisPool,
    config: &SearchConfig,
) -> Result<(), IndexingError> {
    info!("Indexing projects.");

    let indices = get_indexes(config).await?;

    let all_ids = get_all_ids(pool.clone()).await?;
    let all_ids_len = all_ids.len();
    info!("Got all ids, indexing {} projects", all_ids_len);

    let mut so_far = 0;
    let as_chunks: Vec<_> = all_ids
        .into_iter()
        .chunks(FETCH_PROJECT_SIZE)
        .into_iter()
        .map(|x| x.collect::<Vec<_>>())
        .collect();

    for id_chunk in as_chunks {
        info!(
            "Fetching chunk {}-{}/{}, size: {}",
            so_far,
            so_far + FETCH_PROJECT_SIZE,
            all_ids_len,
            id_chunk.len()
        );
        so_far += FETCH_PROJECT_SIZE;

        let id_chunk = id_chunk
            .into_iter()
            .map(|(version_id, project_id, owner_username)| {
                (version_id, (project_id, owner_username.to_lowercase()))
            })
            .collect::<HashMap<_, _>>();
        let (uploads, loader_fields) = index_local(&pool, &redis, id_chunk).await?;

        info!("Got chunk, adding to docs_to_add");
        add_projects(&indices, uploads, loader_fields, config).await?;
    }

    info!("Done adding projects.");
    Ok(())
}
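
/// Returns the `projects` and `projects_filtered` indexes, creating them or
/// refreshing their settings as needed.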
pub async fn get_indexes(
    config: &SearchConfig,
) -> Result<Vec<Index>, meilisearch_sdk::errors::Error> {
    let client = config.make_client();

    let projects_index = create_or_update_index(&client, "projects", None).await?;
    let projects_filtered_index = create_or_update_index(
        &client,
        "projects_filtered",
        Some(&[
            "sort",
            "words",
            "typo",
            "proximity",
            "attribute",
            "exactness",
        ]),
    )
    .await?;

    Ok(vec![projects_index, projects_filtered_index])
}
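
/// Fetches the named index and applies the default settings (plus any custom
/// ranking rules); if the index does not exist yet, it is created first with
/// `version_id` as the primary key.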
async fn create_or_update_index(
    client: &Client,
    name: &'static str,
    custom_rules: Option<&'static [&'static str]>,
) -> Result<Index, meilisearch_sdk::errors::Error> {
    info!("Updating/creating index.");

    match client.get_index(name).await {
        Ok(index) => {
            info!("Updating index settings.");

            let mut settings = default_settings();

            if let Some(custom_rules) = custom_rules {
                settings = settings.with_ranking_rules(custom_rules);
            }

            index
                .set_settings(&settings)
                .await?
                .wait_for_completion(client, None, Some(TIMEOUT))
                .await?;
            Ok(index)
        }
        _ => {
            info!("Creating index.");

            // Only create index and set settings if the index doesn't already exist
            let task = client.create_index(name, Some("version_id")).await?;
            let task = task
                .wait_for_completion(client, None, Some(TIMEOUT))
                .await?;
            let index = task
                .try_make_index(client)
                .map_err(|x| x.unwrap_failure())?;

            let mut settings = default_settings();

            if let Some(custom_rules) = custom_rules {
                settings = settings.with_ranking_rules(custom_rules);
            }

            index
                .set_settings(&settings)
                .await?
                .wait_for_completion(client, None, Some(TIMEOUT))
                .await?;

            Ok(index)
        }
    }
}
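
/// Uploads documents to a single index in `MEILISEARCH_CHUNK_SIZE` batches,
/// waiting for each MeiliSearch task to complete before sending the next one.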
async fn add_to_index(
    client: &Client,
    index: &Index,
    mods: &[UploadSearchProject],
) -> Result<(), IndexingError> {
    for chunk in mods.chunks(MEILISEARCH_CHUNK_SIZE) {
        info!(
            "Adding chunk starting with version id {}",
            chunk[0].version_id
        );
        index
            .add_or_replace(chunk, Some("version_id"))
            .await?
            .wait_for_completion(client, None, Some(std::time::Duration::from_secs(3600)))
            .await?;
        info!("Added chunk of {} projects to index", chunk.len());
    }

    Ok(())
}
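
/// Extends the index's filterable and displayed attributes with the given
/// additional (loader) fields, then uploads the documents.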
async fn update_and_add_to_index(
    client: &Client,
    index: &Index,
    projects: &[UploadSearchProject],
    additional_fields: &[String],
) -> Result<(), IndexingError> {
    let mut new_filterable_attributes: Vec<String> = index.get_filterable_attributes().await?;
    let mut new_displayed_attributes = index.get_displayed_attributes().await?;

    new_filterable_attributes.extend(additional_fields.iter().map(|s| s.to_string()));
    new_displayed_attributes.extend(additional_fields.iter().map(|s| s.to_string()));
    info!("Adding attributes.");
    index
        .set_filterable_attributes(new_filterable_attributes)
        .await?;
    index
        .set_displayed_attributes(new_displayed_attributes)
        .await?;

    info!("Adding to index.");

    add_to_index(client, index, projects).await?;
    Ok(())
}
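
/// Adds the given documents (plus any additional dynamic fields) to every
/// configured search index.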
pub async fn add_projects(
    indices: &[Index],
    projects: Vec<UploadSearchProject>,
    additional_fields: Vec<String>,
    config: &SearchConfig,
) -> Result<(), IndexingError> {
    let client = config.make_client();
    for index in indices {
        info!("Adding projects to index.");
        update_and_add_to_index(&client, index, &projects, &additional_fields).await?;
    }

    Ok(())
}
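
/// Base settings shared by every index: distinct on `project_id`, the default
/// attribute lists below, and an effectively uncapped `max_total_hits`
/// (i32::MAX).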
fn default_settings() -> Settings {
    Settings::new()
        .with_distinct_attribute("project_id")
        .with_displayed_attributes(DEFAULT_DISPLAYED_ATTRIBUTES)
        .with_searchable_attributes(DEFAULT_SEARCHABLE_ATTRIBUTES)
        .with_sortable_attributes(DEFAULT_SORTABLE_ATTRIBUTES)
        .with_filterable_attributes(DEFAULT_ATTRIBUTES_FOR_FACETING)
        .with_pagination(PaginationSetting {
            max_total_hits: 2147483647,
        })
}
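
// Fields returned with each search hit; dynamic loader fields are appended per
// index in `update_and_add_to_index`.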
const DEFAULT_DISPLAYED_ATTRIBUTES: &[&str] = &[
    "project_id",
    "version_id",
    "project_types",
    "slug",
    "author",
    "name",
    "summary",
    "categories",
    "display_categories",
    "downloads",
    "follows",
    "icon_url",
    "date_created",
    "date_modified",
    "latest_version",
    "license",
    "gallery",
    "featured_gallery",
    "color",
    // Note: loader fields are not listed here; they are added on dynamically as needed, depending on which ones exist.

    // Non-searchable fields for filling out the Project model.
    "license_url",
    "monetization_status",
    "team_id",
    "thread_id",
    "versions",
    "date_published",
    "date_queued",
    "status",
    "requested_status",
    "games",
    "organization_id",
    "links",
    "gallery_items",
    "loaders", // search uses loaders as categories; this is purely for the Project model.
];

const DEFAULT_SEARCHABLE_ATTRIBUTES: &[&str] = &["name", "summary", "author", "slug"];

const DEFAULT_ATTRIBUTES_FOR_FACETING: &[&str] = &[
    "categories",
    "license",
    "project_types",
    "downloads",
    "follows",
    "author",
    "name",
    "date_created",
    "created_timestamp",
    "date_modified",
    "modified_timestamp",
    "project_id",
    "open_source",
    "color",
];

const DEFAULT_SORTABLE_ATTRIBUTES: &[&str] =
    &["downloads", "follows", "date_created", "date_modified"];