Search overhaul (#771)

* started work; switching context

* working!

* fmt clippy prepare

* fixes

* fixes

* revs

* merge fixes

* changed comments

* merge issues
This commit is contained in:
Wyatt Verchere
2023-12-03 06:27:12 -08:00
committed by GitHub
parent a70df067bc
commit b2be4a7d67
18 changed files with 882 additions and 585 deletions

View File

@@ -6,171 +6,209 @@ use futures::TryStreamExt;
use log::info;
use super::IndexingError;
use crate::database::models::loader_fields::VersionField;
use crate::database::models::ProjectId;
use crate::database::models::{project_item, version_item, ProjectId, VersionId};
use crate::database::redis::RedisPool;
use crate::models;
use crate::search::UploadSearchProject;
use sqlx::postgres::PgPool;
pub async fn index_local(
pool: PgPool,
redis: &RedisPool,
) -> Result<(Vec<UploadSearchProject>, Vec<String>), IndexingError> {
info!("Indexing local projects!");
let loader_field_keys: Arc<DashSet<String>> = Arc::new(DashSet::new());
let uploads =
sqlx::query!(
"
WITH version_fields_cte AS (
SELECT version_id, field_id, int_value, enum_value, string_value
FROM version_fields
),
version_fields_json AS (
SELECT DISTINCT version_id,
JSONB_AGG(
DISTINCT jsonb_build_object('field_id', field_id, 'int_value', int_value, 'enum_value', enum_value, 'string_value', string_value)
) version_fields_json
FROM version_fields_cte
GROUP BY version_id
),
loader_fields_cte AS (
SELECT DISTINCT vf.version_id, lf.*, l.loader
FROM loader_fields lf
INNER JOIN version_fields_cte vf ON lf.id = vf.field_id
LEFT JOIN loaders_versions lv ON vf.version_id = lv.version_id
LEFT JOIN loaders l ON lv.loader_id = l.id
GROUP BY vf.version_id, lf.enum_type, lf.id, l.loader
),
loader_fields_json AS (
SELECT DISTINCT version_id,
JSONB_AGG(
DISTINCT jsonb_build_object(
'version_id', lf.version_id,
'lf_id', id, 'loader_name', loader, 'field', field, 'field_type', field_type, 'enum_type', enum_type, 'min_val', min_val, 'max_val', max_val, 'optional', optional
)
) filter (where lf.id is not null) loader_fields_json
FROM loader_fields_cte lf
GROUP BY version_id
),
loader_field_enum_values_json AS (
SELECT DISTINCT version_id,
JSONB_AGG(
DISTINCT jsonb_build_object(
'id', lfev.id, 'enum_id', lfev.enum_id, 'value', lfev.value, 'ordering', lfev.ordering, 'created', lfev.created, 'metadata', lfev.metadata
)
) filter (where lfev.id is not null) loader_field_enum_values_json
FROM loader_field_enum_values lfev
INNER JOIN loader_fields_cte lf on lf.enum_type = lfev.enum_id
GROUP BY version_id
)
SELECT m.id id, v.id version_id, m.name name, m.description description, m.downloads downloads, m.follows follows,
m.icon_url icon_url, m.published published, m.approved approved, m.updated updated,
m.team_id team_id, m.license license, m.slug slug, m.status status_name, m.color color,
u.username username,
ARRAY_AGG(DISTINCT c.category) filter (where c.category is not null and mc.is_additional is false) categories,
ARRAY_AGG(DISTINCT c.category) filter (where c.category is not null and mc.is_additional is true) additional_categories,
ARRAY_AGG(DISTINCT lo.loader) filter (where lo.loader is not null) loaders,
ARRAY_AGG(DISTINCT pt.name) filter (where pt.name is not null) project_types,
ARRAY_AGG(DISTINCT g.slug) filter (where g.slug is not null) games,
ARRAY_AGG(DISTINCT mg.image_url) filter (where mg.image_url is not null and mg.featured is false) gallery,
ARRAY_AGG(DISTINCT mg.image_url) filter (where mg.image_url is not null and mg.featured is true) featured_gallery,
vf.version_fields_json version_fields,
lf.loader_fields_json loader_fields,
lfev.loader_field_enum_values_json loader_field_enum_values
FROM versions v
INNER JOIN mods m ON v.mod_id = m.id AND m.status = ANY($2)
LEFT OUTER JOIN mods_categories mc ON joining_mod_id = m.id
LEFT OUTER JOIN categories c ON mc.joining_category_id = c.id
LEFT OUTER JOIN loaders_versions lv ON lv.version_id = v.id
LEFT OUTER JOIN loaders lo ON lo.id = lv.loader_id
LEFT JOIN loaders_project_types lpt ON lpt.joining_loader_id = lo.id
LEFT JOIN project_types pt ON pt.id = lpt.joining_project_type_id
LEFT JOIN loaders_project_types_games lptg ON lptg.loader_id = lo.id AND lptg.project_type_id = pt.id
LEFT JOIN games g ON lptg.game_id = g.id
LEFT OUTER JOIN mods_gallery mg ON mg.mod_id = m.id
INNER JOIN team_members tm ON tm.team_id = m.team_id AND tm.is_owner = TRUE AND tm.accepted = TRUE
INNER JOIN users u ON tm.user_id = u.id
LEFT OUTER JOIN version_fields_json vf ON v.id = vf.version_id
LEFT OUTER JOIN loader_fields_json lf ON v.id = lf.version_id
LEFT OUTER JOIN loader_field_enum_values_json lfev ON v.id = lfev.version_id
WHERE v.status != ANY($1)
GROUP BY v.id, vf.version_fields_json, lf.loader_fields_json, lfev.loader_field_enum_values_json, m.id, u.id;
",
&*crate::models::projects::VersionStatus::iterator().filter(|x| x.is_hidden()).map(|x| x.to_string()).collect::<Vec<String>>(),
&*crate::models::projects::ProjectStatus::iterator().filter(|x| x.is_searchable()).map(|x| x.to_string()).collect::<Vec<String>>(),
)
.fetch_many(&pool)
.try_filter_map(|e| {
let loader_field_keys = loader_field_keys.clone();
async move {
Ok(e.right().map(|m| {
let mut additional_categories = m.additional_categories.unwrap_or_default();
let mut categories = m.categories.unwrap_or_default();
let all_visible_ids: HashMap<VersionId, (ProjectId, String)> = sqlx::query!(
"
SELECT v.id id, m.id mod_id, u.username owner_username
FROM versions v
INNER JOIN mods m ON v.mod_id = m.id AND m.status = ANY($2)
INNER JOIN team_members tm ON tm.team_id = m.team_id AND tm.is_owner = TRUE AND tm.accepted = TRUE
INNER JOIN users u ON tm.user_id = u.id
WHERE v.status != ANY($1)
GROUP BY v.id, m.id, u.id
ORDER BY m.id DESC;
",
&*crate::models::projects::VersionStatus::iterator()
.filter(|x| x.is_hidden())
.map(|x| x.to_string())
.collect::<Vec<String>>(),
&*crate::models::projects::ProjectStatus::iterator()
.filter(|x| x.is_searchable())
.map(|x| x.to_string())
.collect::<Vec<String>>(),
)
.fetch_many(&pool)
.try_filter_map(|e| async move {
Ok(e.right().map(|m| {
let project_id: ProjectId = ProjectId(m.mod_id);
let version_id: VersionId = VersionId(m.id);
(version_id, (project_id, m.owner_username))
}))
})
.try_collect::<HashMap<_, _>>()
.await?;
categories.append(&mut m.loaders.unwrap_or_default());
let project_ids = all_visible_ids
.values()
.map(|(project_id, _)| project_id)
.cloned()
.collect::<Vec<_>>();
let projects: HashMap<_, _> = project_item::Project::get_many_ids(&project_ids, &pool, redis)
.await?
.into_iter()
.map(|p| (p.inner.id, p))
.collect();
let display_categories = categories.clone();
categories.append(&mut additional_categories);
let version_ids = all_visible_ids.keys().cloned().collect::<Vec<_>>();
let versions: HashMap<_, _> = version_item::Version::get_many(&version_ids, &pool, redis)
.await?
.into_iter()
.map(|v| (v.inner.id, v))
.collect();
let version_fields = VersionField::from_query_json(m.loader_fields, m.version_fields, m.loader_field_enum_values, false);
let mut uploads = Vec::new();
// TODO: could possibly clone less here?
for (version_id, (project_id, owner_username)) in all_visible_ids {
let m = projects.get(&project_id);
let v = versions.get(&version_id);
let loader_fields : HashMap<String, Vec<String>> = version_fields.into_iter().map(|vf| {
(vf.field_name, vf.value.as_strings())
}).collect();
let m = match m {
Some(m) => m,
None => continue,
};
for v in loader_fields.keys().cloned() {
loader_field_keys.insert(v);
}
let v = match v {
Some(v) => v,
None => continue,
};
let project_id: crate::models::projects::ProjectId = ProjectId(m.id).into();
let version_id: crate::models::projects::ProjectId = ProjectId(m.version_id).into();
let version_id: crate::models::projects::VersionId = v.inner.id.into();
let project_id: crate::models::projects::ProjectId = m.inner.id.into();
let team_id: crate::models::teams::TeamId = m.inner.team_id.into();
let organization_id: Option<crate::models::organizations::OrganizationId> =
m.inner.organization_id.map(|x| x.into());
let thread_id: crate::models::threads::ThreadId = m.thread_id.into();
let license = match m.license.split(' ').next() {
Some(license) => license.to_string(),
None => m.license,
};
let all_version_ids = m
.versions
.iter()
.map(|v| (*v).into())
.collect::<Vec<crate::models::projects::VersionId>>();
let open_source = match spdx::license_id(&license) {
Some(id) => id.is_osi_approved(),
_ => false,
};
let mut additional_categories = m.additional_categories.clone();
let mut categories = m.categories.clone();
// SPECIAL BEHAVIOUR
// Todo: revisit.
// For consistency with v2 searching, we consider the loader field 'mrpack_loaders' to be a category.
// These were previously considered the loader, and in v2, the loader is a category for searching.
// So to avoid breakage or awkward conversions, we just consider those loader_fields to be categories.
// The loaders are kept in loader_fields as well, so that no information is lost on retrieval.
let mrpack_loaders = loader_fields.get("mrpack_loaders").cloned().unwrap_or_default();
categories.extend(mrpack_loaders);
// Uses version loaders, not project loaders.
categories.append(&mut v.loaders.clone());
let display_categories = categories.clone();
categories.append(&mut additional_categories);
let version_fields = v.version_fields.clone();
let loader_fields = models::projects::from_duplicate_version_fields(version_fields);
for v in loader_fields.keys().cloned() {
loader_field_keys.insert(v);
}
let license = match m.inner.license.split(' ').next() {
Some(license) => license.to_string(),
None => m.inner.license.clone(),
};
let open_source = match spdx::license_id(&license) {
Some(id) => id.is_osi_approved(),
_ => false,
};
// For loaders, get ALL loaders across ALL versions
let mut loaders = all_version_ids
.iter()
.fold(vec![], |mut loaders, version_id| {
let version = versions.get(&(*version_id).into());
if let Some(version) = version {
loaders.extend(version.loaders.clone());
}
loaders
});
loaders.sort();
loaders.dedup();
// SPECIAL BEHAVIOUR
// Todo: revisit.
// For consistency with v2 searching, we consider the loader field 'mrpack_loaders' to be a category.
// These were previously considered the loader, and in v2, the loader is a category for searching.
// So to avoid breakage or awkward conversions, we just consider those loader_fields to be categories.
// The loaders are kept in loader_fields as well, so that no information is lost on retrieval.
let mrpack_loaders = loader_fields
.get("mrpack_loaders")
.cloned()
.map(|x| {
x.into_iter()
.filter_map(|x| x.as_str().map(String::from))
.collect::<Vec<_>>()
})
.unwrap_or_default();
categories.extend(mrpack_loaders);
let gallery = m
.gallery_items
.iter()
.filter(|gi| !gi.featured)
.map(|gi| gi.image_url.clone())
.collect::<Vec<_>>();
let featured_gallery = m
.gallery_items
.iter()
.filter(|gi| gi.featured)
.map(|gi| gi.image_url.clone())
.collect::<Vec<_>>();
let featured_gallery = featured_gallery.first().cloned();
let usp = UploadSearchProject {
version_id: version_id.to_string(),
project_id: project_id.to_string(),
name: m.inner.name.clone(),
summary: m.inner.summary.clone(),
categories,
follows: m.inner.follows,
downloads: m.inner.downloads,
icon_url: m.inner.icon_url.clone(),
author: owner_username,
date_created: m.inner.approved.unwrap_or(m.inner.published),
created_timestamp: m.inner.approved.unwrap_or(m.inner.published).timestamp(),
date_modified: m.inner.updated,
modified_timestamp: m.inner.updated.timestamp(),
license,
slug: m.inner.slug.clone(),
project_types: m.project_types.clone(),
gallery,
featured_gallery,
display_categories,
open_source,
color: m.inner.color,
loader_fields,
license_url: m.inner.license_url.clone(),
monetization_status: Some(m.inner.monetization_status),
team_id: team_id.to_string(),
organization_id: organization_id.map(|x| x.to_string()),
thread_id: thread_id.to_string(),
versions: all_version_ids.iter().map(|x| x.to_string()).collect(),
date_published: m.inner.published,
date_queued: m.inner.queued,
status: m.inner.status,
requested_status: m.inner.requested_status,
games: m.games.clone(),
links: m.urls.clone(),
gallery_items: m.gallery_items.clone(),
loaders,
};
uploads.push(usp);
}
UploadSearchProject {
version_id: version_id.to_string(),
project_id: project_id.to_string(),
name: m.name,
description: m.description,
categories,
follows: m.follows,
downloads: m.downloads,
icon_url: m.icon_url.unwrap_or_default(),
author: m.username,
date_created: m.approved.unwrap_or(m.published),
created_timestamp: m.approved.unwrap_or(m.published).timestamp(),
date_modified: m.updated,
modified_timestamp: m.updated.timestamp(),
license,
slug: m.slug,
project_types: m.project_types.unwrap_or_default(),
gallery: m.gallery.unwrap_or_default(),
display_categories,
open_source,
color: m.color.map(|x| x as u32),
featured_gallery: m.featured_gallery.unwrap_or_default().first().cloned(),
loader_fields
}
}))
}})
.try_collect::<Vec<_>>()
.await?;
Ok((
uploads,
Arc::try_unwrap(loader_field_keys)

View File

@@ -1,6 +1,7 @@
/// This module is used for indexing projects from any source.
pub mod local_import;
use crate::database::redis::RedisPool;
use crate::search::{SearchConfig, UploadSearchProject};
use local_import::index_local;
use meilisearch_sdk::client::Client;
@@ -30,11 +31,15 @@ pub enum IndexingError {
// assumes a max average size of 1KiB per project to avoid this cap.
const MEILISEARCH_CHUNK_SIZE: usize = 10000;
pub async fn index_projects(pool: PgPool, config: &SearchConfig) -> Result<(), IndexingError> {
pub async fn index_projects(
pool: PgPool,
redis: RedisPool,
config: &SearchConfig,
) -> Result<(), IndexingError> {
let mut docs_to_add: Vec<UploadSearchProject> = vec![];
let mut additional_fields: Vec<String> = vec![];
let (mut uploads, mut loader_fields) = index_local(pool.clone()).await?;
let (mut uploads, mut loader_fields) = index_local(pool.clone(), &redis).await?;
docs_to_add.append(&mut uploads);
additional_fields.append(&mut loader_fields);
@@ -186,7 +191,7 @@ const DEFAULT_DISPLAYED_ATTRIBUTES: &[&str] = &[
"slug",
"author",
"name",
"description",
"summary",
"categories",
"display_categories",
"downloads",
@@ -199,9 +204,26 @@ const DEFAULT_DISPLAYED_ATTRIBUTES: &[&str] = &[
"gallery",
"featured_gallery",
"color",
// Note: loader fields are not listed here; they are added dynamically as needed, depending on which ones exist.
// Non-searchable fields for filling out the Project model.
"license_url",
"monetization_status",
"team_id",
"thread_id",
"versions",
"date_published",
"date_queued",
"status",
"requested_status",
"games",
"organization_id",
"links",
"gallery_items",
"loaders", // search uses loaders as categories- this is purely for the Project model.
];
const DEFAULT_SEARCHABLE_ATTRIBUTES: &[&str] = &["name", "description", "author", "slug"];
const DEFAULT_SEARCHABLE_ATTRIBUTES: &[&str] = &["name", "summary", "author", "slug"];
const DEFAULT_ATTRIBUTES_FOR_FACETING: &[&str] = &[
"categories",

View File

@@ -1,5 +1,6 @@
use crate::database::models::project_item::{GalleryItem, LinkUrl};
use crate::models::error::ApiError;
use crate::models::projects::SearchRequest;
use crate::models::projects::{MonetizationStatus, ProjectStatus, SearchRequest};
use actix_web::http::StatusCode;
use actix_web::HttpResponse;
use chrono::{DateTime, Utc};
@@ -80,12 +81,12 @@ pub struct UploadSearchProject {
pub slug: Option<String>,
pub author: String,
pub name: String,
pub description: String,
pub summary: String,
pub categories: Vec<String>,
pub display_categories: Vec<String>,
pub follows: i32,
pub downloads: i32,
pub icon_url: String,
pub icon_url: Option<String>,
pub license: String,
pub gallery: Vec<String>,
pub featured_gallery: Option<String>,
@@ -100,8 +101,24 @@ pub struct UploadSearchProject {
pub open_source: bool,
pub color: Option<u32>,
// Hidden fields to get the Project model out of the search results.
pub license_url: Option<String>,
pub monetization_status: Option<MonetizationStatus>,
pub team_id: String,
pub thread_id: String,
pub versions: Vec<String>,
pub date_published: DateTime<Utc>,
pub date_queued: Option<DateTime<Utc>>,
pub status: ProjectStatus,
pub requested_status: Option<ProjectStatus>,
pub loaders: Vec<String>, // Search uses loaders as categories- this is purely for the Project model.
pub links: Vec<LinkUrl>,
pub gallery_items: Vec<GalleryItem>, // Gallery *only* urls are stored in gallery, but the gallery items are stored here- required for the Project model.
pub games: Vec<String>, // Todo: in future, could be a searchable field.
pub organization_id: Option<String>, // Todo: in future, could be a searchable field.
#[serde(flatten)]
pub loader_fields: HashMap<String, Vec<String>>,
pub loader_fields: HashMap<String, Vec<serde_json::Value>>,
}
#[derive(Serialize, Deserialize, Debug)]
@@ -120,12 +137,12 @@ pub struct ResultSearchProject {
pub slug: Option<String>,
pub author: String,
pub name: String,
pub description: String,
pub summary: String,
pub categories: Vec<String>,
pub display_categories: Vec<String>,
pub downloads: i32,
pub follows: i32,
pub icon_url: String,
pub icon_url: Option<String>,
/// RFC 3339 formatted creation date of the project
pub date_created: String,
/// RFC 3339 formatted modification date of the project
@@ -135,8 +152,24 @@ pub struct ResultSearchProject {
pub featured_gallery: Option<String>,
pub color: Option<u32>,
// Hidden fields to get the Project model out of the search results.
pub license_url: Option<String>,
pub monetization_status: Option<String>,
pub team_id: String,
pub thread_id: String,
pub versions: Vec<String>,
pub date_published: String,
pub date_queued: Option<String>,
pub status: String,
pub requested_status: Option<String>,
pub loaders: Vec<String>, // Search uses loaders as categories- this is purely for the Project model.
pub links: Vec<LinkUrl>,
pub games: Vec<String>, // Todo: in future, could be a searchable field.
pub gallery_items: Vec<GalleryItem>, // Gallery *only* urls are stored in gallery, but the gallery items are stored here- required for the Project model.
pub organization_id: Option<String>, // Todo: in future, could be a searchable field.
#[serde(flatten)]
pub loader_fields: HashMap<String, Vec<String>>,
pub loader_fields: HashMap<String, Vec<serde_json::Value>>,
}
pub fn get_sort_index(index: &str) -> Result<(&str, [&str; 1]), SearchError> {