AstralRinth/apps/labrinth/src/search/mod.rs
François-Xavier Talbot · 3ffa78aa07 · Allow many Meilisearch write addrs (#5102) · 2026-01-15 01:38:09 +01:00

* Write to many Meilisearch write addrs
* Keep client results ordered
* Attach Read Meilisearch client to actix data
* Load balanced meilisearch Compose profile
* Nginx config (round_robin)
* Fix nginx
* Meilisearch + nginx in same net
* Fix env vars example
* Fix env example again
* Fix env again
* Use try_collect with FuturesOrdered
* maybe fix remove_documents
* Clippy

use crate::models::error::ApiError;
use crate::models::projects::SearchRequest;
use actix_web::HttpResponse;
use actix_web::http::StatusCode;
use chrono::{DateTime, Utc};
use futures::TryStreamExt;
use futures::stream::FuturesOrdered;
use itertools::Itertools;
use meilisearch_sdk::client::Client;
use serde::{Deserialize, Serialize};
use serde_json::Value;
use std::borrow::Cow;
use std::collections::HashMap;
use std::fmt::Write;
use thiserror::Error;
use tracing::{Instrument, info_span};
pub mod indexing;
#[derive(Error, Debug)]
pub enum SearchError {
#[error("MeiliSearch Error: {0}")]
MeiliSearch(#[from] meilisearch_sdk::errors::Error),
#[error("Error while serializing or deserializing JSON: {0}")]
Serde(#[from] serde_json::Error),
#[error("Error while parsing an integer: {0}")]
IntParsing(#[from] std::num::ParseIntError),
#[error("Error while formatting strings: {0}")]
FormatError(#[from] std::fmt::Error),
#[error("Environment Error")]
Env(#[from] dotenvy::Error),
#[error("Invalid index to sort by: {0}")]
InvalidIndex(String),
}
impl actix_web::ResponseError for SearchError {
fn status_code(&self) -> StatusCode {
match self {
SearchError::Env(..) => StatusCode::INTERNAL_SERVER_ERROR,
SearchError::MeiliSearch(..) => StatusCode::BAD_REQUEST,
SearchError::Serde(..) => StatusCode::BAD_REQUEST,
SearchError::IntParsing(..) => StatusCode::BAD_REQUEST,
SearchError::InvalidIndex(..) => StatusCode::BAD_REQUEST,
SearchError::FormatError(..) => StatusCode::BAD_REQUEST,
}
}
fn error_response(&self) -> HttpResponse {
HttpResponse::build(self.status_code()).json(ApiError {
error: match self {
SearchError::Env(..) => "environment_error",
SearchError::MeiliSearch(..) => "meilisearch_error",
SearchError::Serde(..) => "invalid_input",
SearchError::IntParsing(..) => "invalid_input",
SearchError::InvalidIndex(..) => "invalid_input",
SearchError::FormatError(..) => "invalid_input",
},
description: self.to_string(),
details: None,
})
}
}
#[derive(Debug, Clone)]
pub struct MeilisearchReadClient {
pub client: Client,
}
impl std::ops::Deref for MeilisearchReadClient {
type Target = Client;
fn deref(&self) -> &Self::Target {
&self.client
}
}
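// Because of the `Deref` impl above, the wrapper can be used anywhere a plain
// SDK `Client` is expected (e.g. `read_client.get_index(...)` in
// `search_for_project` below), while still giving actix-web a distinct type
// to register as app data for the read path, per the commit notes.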
pub struct BatchClient {
pub clients: Vec<Client>,
}
impl BatchClient {
pub fn new(clients: Vec<Client>) -> Self {
Self { clients }
}
pub async fn with_all_clients<'a, T, G, Fut>(
&'a self,
task_name: &str,
generator: G,
) -> Result<Vec<T>, meilisearch_sdk::errors::Error>
where
G: Fn(&'a Client) -> Fut,
Fut: Future<Output = Result<T, meilisearch_sdk::errors::Error>> + 'a,
{
let mut tasks = FuturesOrdered::new();
for (idx, client) in self.clients.iter().enumerate() {
tasks.push_back(generator(client).instrument(info_span!(
"client_task",
task.name = task_name,
client.idx = idx,
)));
}
let results = tasks.try_collect::<Vec<T>>().await?;
Ok(results)
}
pub fn across_all<T, F, R>(&self, data: Vec<T>, mut predicate: F) -> Vec<R>
where
F: FnMut(T, &Client) -> R,
{
assert_eq!(
data.len(),
self.clients.len(),
"mismatch between data len and meilisearch client count"
);
self.clients
.iter()
.zip(data)
.map(|(client, item)| predicate(item, client))
.collect()
}
}
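// A minimal sketch (assumptions: an already-built `BatchClient`, plus the
// SDK's `Client::health()` endpoint and its public `status` field) showing
// the write-replica fan-out: `with_all_clients` runs the same call against
// every client, and `FuturesOrdered` keeps the results in client order,
// which is exactly the invariant `across_all` relies on when zipping results
// back to their originating clients.
#[allow(dead_code)]
async fn example_batch_fan_out(
    batch: &BatchClient,
) -> Result<(), meilisearch_sdk::errors::Error> {
    // One health check per write replica, results ordered like `batch.clients`.
    let statuses = batch
        .with_all_clients("health_check", |client| client.health())
        .await?;
    // Pair each result back with the client index that produced it.
    let summaries = batch.across_all(statuses, |health, _client| health.status);
    for (idx, status) in summaries.iter().enumerate() {
        tracing::info!("write client {idx}: {status}");
    }
    Ok(())
}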
#[derive(Debug, Clone)]
pub struct SearchConfig {
pub addresses: Vec<String>,
pub read_lb_address: String,
pub key: String,
pub meta_namespace: String,
}
impl SearchConfig {
// Panics if the environment variables are not set,
// but these are already checked for on startup.
pub fn new(meta_namespace: Option<String>) -> Self {
let address_many = dotenvy::var("MEILISEARCH_WRITE_ADDRS")
.expect("MEILISEARCH_WRITE_ADDRS not set");
let read_lb_address = dotenvy::var("MEILISEARCH_READ_ADDR")
.expect("MEILISEARCH_READ_ADDR not set");
let addresses = address_many
    .split(',')
    .map(str::trim)
    .filter(|s| !s.is_empty())
    .map(|s| s.to_string())
    .collect::<Vec<String>>();
let key =
dotenvy::var("MEILISEARCH_KEY").expect("MEILISEARCH_KEY not set");
Self {
addresses,
key,
meta_namespace: meta_namespace.unwrap_or_default(),
read_lb_address,
}
}
pub fn make_loadbalanced_read_client(
&self,
) -> Result<MeilisearchReadClient, meilisearch_sdk::errors::Error> {
Ok(MeilisearchReadClient {
client: Client::new(&self.read_lb_address, Some(&self.key))?,
})
}
pub fn make_batch_client(
&self,
) -> Result<BatchClient, meilisearch_sdk::errors::Error> {
Ok(BatchClient::new(
self.addresses
.iter()
.map(|address| {
Client::new(address.as_str(), Some(self.key.as_str()))
})
.collect::<Result<Vec<_>, _>>()?,
))
}
// `next`: true returns the next index name (the one being prepared for the
// upcoming swap); false returns the current index name (the one being searched).
pub fn get_index_name(&self, index: &str, next: bool) -> String {
let alt = if next { "_alt" } else { "" };
format!("{}_{}_{}", self.meta_namespace, index, alt)
}
}
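// A minimal sketch of the expected environment (addresses are illustrative):
//   MEILISEARCH_WRITE_ADDRS=http://meilisearch-0:7700,http://meilisearch-1:7700
//   MEILISEARCH_READ_ADDR=http://nginx-lb:7700
//   MEILISEARCH_KEY=masterKey
// Writes fan out to every address in MEILISEARCH_WRITE_ADDRS through
// `BatchClient`, while reads go through the single load-balanced
// MEILISEARCH_READ_ADDR (e.g. the round_robin nginx upstream from this PR).
#[allow(dead_code)]
fn example_build_clients(
    config: &SearchConfig,
) -> Result<(MeilisearchReadClient, BatchClient), meilisearch_sdk::errors::Error>
{
    let read = config.make_loadbalanced_read_client()?;
    let writes = config.make_batch_client()?;
    Ok((read, writes))
}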
/// A project document used for uploading projects to MeiliSearch's indices.
/// This contains some extra data that is not returned by search results.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct UploadSearchProject {
pub version_id: String,
pub project_id: String,
//
pub project_types: Vec<String>,
pub slug: Option<String>,
pub author: String,
pub name: String,
pub summary: String,
pub categories: Vec<String>,
pub display_categories: Vec<String>,
pub follows: i32,
pub downloads: i32,
pub icon_url: Option<String>,
pub license: String,
pub gallery: Vec<String>,
pub featured_gallery: Option<String>,
/// RFC 3339 formatted creation date of the project
pub date_created: DateTime<Utc>,
/// Unix timestamp of the creation date of the project
pub created_timestamp: i64,
/// RFC 3339 formatted date/time of last major modification (update)
pub date_modified: DateTime<Utc>,
/// Unix timestamp of the last major modification
pub modified_timestamp: i64,
pub open_source: bool,
pub color: Option<u32>,
// Hidden fields to get the Project model out of the search results.
pub loaders: Vec<String>, // Search uses loaders as categories; this field exists purely to reconstruct the Project model.
pub project_loader_fields: HashMap<String, Vec<serde_json::Value>>, // Aggregation of loader_fields from all versions of the project, allowing for reconstruction of the Project model.
#[serde(flatten)]
pub loader_fields: HashMap<String, Vec<serde_json::Value>>,
}
#[derive(Serialize, Deserialize, Debug)]
pub struct SearchResults {
pub hits: Vec<ResultSearchProject>,
pub page: usize,
pub hits_per_page: usize,
pub total_hits: usize,
}
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct ResultSearchProject {
pub version_id: String,
pub project_id: String,
pub project_types: Vec<String>,
pub slug: Option<String>,
pub author: String,
pub name: String,
pub summary: String,
pub categories: Vec<String>,
pub display_categories: Vec<String>,
pub downloads: i32,
pub follows: i32,
pub icon_url: Option<String>,
/// RFC 3339 formatted creation date of the project
pub date_created: String,
/// RFC 3339 formatted modification date of the project
pub date_modified: String,
pub license: String,
pub gallery: Vec<String>,
pub featured_gallery: Option<String>,
pub color: Option<u32>,
// Hidden fields to get the Project model out of the search results.
pub loaders: Vec<String>, // Search uses loaders as categories; this field exists purely to reconstruct the Project model.
pub project_loader_fields: HashMap<String, Vec<serde_json::Value>>, // Aggregation of loader_fields from all versions of the project, allowing for reconstruction of the Project model.
#[serde(flatten)]
pub loader_fields: HashMap<String, Vec<serde_json::Value>>,
}
pub fn get_sort_index(
config: &SearchConfig,
index: &str,
) -> Result<(String, [&'static str; 1]), SearchError> {
let projects_name = config.get_index_name("projects", false);
let projects_filtered_name =
config.get_index_name("projects_filtered", false);
Ok(match index {
"relevance" => (projects_name, ["downloads:desc"]),
"downloads" => (projects_filtered_name, ["downloads:desc"]),
"follows" => (projects_name, ["follows:desc"]),
"updated" => (projects_name, ["date_modified:desc"]),
"newest" => (projects_name, ["date_created:desc"]),
i => return Err(SearchError::InvalidIndex(i.to_string())),
})
}
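// Example: `get_sort_index(config, "updated")` resolves to the main projects
// index with the sort rule ["date_modified:desc"]; "downloads" is the only
// key routed to the separate "projects_filtered" index, and any other key is
// rejected with `SearchError::InvalidIndex`.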
pub async fn search_for_project(
info: &SearchRequest,
config: &SearchConfig,
client: &MeilisearchReadClient,
) -> Result<SearchResults, SearchError> {
let offset: usize = info.offset.as_deref().unwrap_or("0").parse()?;
let index = info.index.as_deref().unwrap_or("relevance");
let limit = info
.limit
.as_deref()
.unwrap_or("10")
.parse::<usize>()?
.min(100);
let sort = get_sort_index(config, index)?;
let meilisearch_index = client.get_index(sort.0).await?;
let mut filter_string = String::new();
// Convert offset and limit to page and hits_per_page
let hits_per_page = if limit == 0 { 1 } else { limit };
let page = offset / hits_per_page + 1;
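// Worked example: offset=20, limit=10 gives hits_per_page=10 and
// page = 20 / 10 + 1 = 3, i.e. Meilisearch's 1-indexed third page,
// which covers hits 20..=29.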
let results = {
let mut query = meilisearch_index.search();
query
.with_page(page)
.with_hits_per_page(hits_per_page)
.with_query(info.query.as_deref().unwrap_or_default())
.with_sort(&sort.1);
if let Some(new_filters) = info.new_filters.as_deref() {
query.with_filter(new_filters);
} else {
let facets = if let Some(facets) = &info.facets {
Some(serde_json::from_str::<Vec<Vec<Value>>>(facets)?)
} else {
None
};
let filters: Cow<_> =
match (info.filters.as_deref(), info.version.as_deref()) {
(Some(f), Some(v)) => format!("({f}) AND ({v})").into(),
(Some(f), None) => f.into(),
(None, Some(v)) => v.into(),
(None, None) => "".into(),
};
if let Some(facets) = facets {
// Facets can now *optionally* carry a third inner array, i.e. Vec(AND)<Vec(OR)<Vec(AND)<_>>>.
// Each inner facet that is a JSON array is deserialized into a Vec<String>;
// anything else is treated as a single facet and wrapped in a Vec.
let facets: Vec<Vec<Vec<String>>> = facets
.into_iter()
.map(|facets| {
facets
.into_iter()
.map(|facet| {
if facet.is_array() {
serde_json::from_value::<Vec<String>>(facet)
.unwrap_or_default()
} else {
vec![
serde_json::from_value::<String>(facet)
.unwrap_or_default(),
]
}
})
.collect_vec()
})
.collect_vec();
filter_string.push('(');
for (index, facet_outer_list) in facets.iter().enumerate() {
filter_string.push('(');
for (facet_outer_index, facet_inner_list) in
facet_outer_list.iter().enumerate()
{
filter_string.push('(');
for (facet_inner_index, facet) in
facet_inner_list.iter().enumerate()
{
filter_string.push_str(&facet.replace(':', " = "));
if facet_inner_index != (facet_inner_list.len() - 1)
{
filter_string.push_str(" AND ")
}
}
filter_string.push(')');
if facet_outer_index != (facet_outer_list.len() - 1) {
filter_string.push_str(" OR ")
}
}
filter_string.push(')');
if index != (facets.len() - 1) {
filter_string.push_str(" AND ")
}
}
filter_string.push(')');
if !filters.is_empty() {
write!(filter_string, " AND ({filters})")?;
}
} else {
filter_string.push_str(&filters);
}
if !filter_string.is_empty() {
query.with_filter(&filter_string);
}
}
query.execute::<ResultSearchProject>().await?
};
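// Worked example of the facet translation above (values illustrative):
// facets = [[["categories:forge"], ["versions:1.20", "versions:1.21"]]]
// produces the Meilisearch filter
//   (((categories = forge) OR (versions = 1.20 AND versions = 1.21)))
// i.e. the outermost lists AND together, the middle lists OR together, and
// the innermost lists AND together.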
Ok(SearchResults {
hits: results.hits.into_iter().map(|r| r.result).collect(),
page: results.page.unwrap_or_default(),
hits_per_page: results.hits_per_page.unwrap_or_default(),
total_hits: results.total_hits.unwrap_or_default(),
})
}
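// A minimal sketch of calling the search path above, assuming `SearchRequest`
// was already extracted from the query string and the read client was pulled
// out of actix-web app data (see the commit notes):
#[allow(dead_code)]
async fn example_search(
    info: &SearchRequest,
    config: &SearchConfig,
    read_client: &MeilisearchReadClient,
) -> Result<(), SearchError> {
    let results = search_for_project(info, config, read_client).await?;
    tracing::info!(
        "page {} of results: {} of {} total hits",
        results.page,
        results.hits.len(),
        results.total_hits
    );
    Ok(())
}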