Search overhaul (#771)

* started work; switching context * working! * fmt clippy prepare * fixes * fixes * revs * merge fixes * changed comments * merge issues
2023-12-03 06:27:12 -08:00
parent a70df067bc
commit b2be4a7d67
18 changed files with 882 additions and 585 deletions
--- a/src/search/indexing/local_import.rs
+++ b/src/search/indexing/local_import.rs
@@ -6,171 +6,209 @@ use futures::TryStreamExt;
 use log::info;

 use super::IndexingError;
-use crate::database::models::loader_fields::VersionField;
-use crate::database::models::ProjectId;
+use crate::database::models::{project_item, version_item, ProjectId, VersionId};
+use crate::database::redis::RedisPool;
+use crate::models;
 use crate::search::UploadSearchProject;
 use sqlx::postgres::PgPool;

 pub async fn index_local(
    pool: PgPool,
+    redis: &RedisPool,
 ) -> Result<(Vec<UploadSearchProject>, Vec<String>), IndexingError> {
    info!("Indexing local projects!");
    let loader_field_keys: Arc<DashSet<String>> = Arc::new(DashSet::new());
-    let uploads =
-        sqlx::query!(
-            "
-            WITH version_fields_cte AS (
-                SELECT version_id, field_id, int_value, enum_value, string_value
-                FROM version_fields
-            ),
-            version_fields_json AS (
-                SELECT DISTINCT version_id,
-                JSONB_AGG( 
-                    DISTINCT jsonb_build_object('field_id', field_id, 'int_value', int_value, 'enum_value', enum_value, 'string_value', string_value)
-                ) version_fields_json
-                FROM version_fields_cte
-                GROUP BY version_id
-            ),
-            loader_fields_cte AS (
-                SELECT DISTINCT vf.version_id, lf.*, l.loader
-                FROM loader_fields lf
-                INNER JOIN version_fields_cte vf ON lf.id = vf.field_id
-                LEFT JOIN loaders_versions lv ON vf.version_id = lv.version_id
-                LEFT JOIN loaders l ON lv.loader_id = l.id
-                GROUP BY vf.version_id, lf.enum_type, lf.id, l.loader
-            ),
-            loader_fields_json AS (
-                SELECT DISTINCT version_id,
-                    JSONB_AGG(
-                    DISTINCT jsonb_build_object(
-                        'version_id', lf.version_id,
-                        'lf_id', id, 'loader_name', loader, 'field', field, 'field_type', field_type, 'enum_type', enum_type, 'min_val', min_val, 'max_val', max_val, 'optional', optional
-                    )
-                ) filter (where lf.id is not null) loader_fields_json
-                FROM loader_fields_cte lf
-                GROUP BY version_id
-            ),
-            loader_field_enum_values_json AS (
-                SELECT DISTINCT version_id,
-                    JSONB_AGG(
-                    DISTINCT jsonb_build_object(
-                        'id', lfev.id, 'enum_id', lfev.enum_id, 'value', lfev.value, 'ordering', lfev.ordering, 'created', lfev.created, 'metadata', lfev.metadata
-                    ) 
-                ) filter (where lfev.id is not null) loader_field_enum_values_json
-                FROM loader_field_enum_values lfev
-                INNER JOIN loader_fields_cte lf on lf.enum_type = lfev.enum_id
-                GROUP BY version_id
-            )

-            SELECT m.id id, v.id version_id, m.name name, m.description description, m.downloads downloads, m.follows follows,
-            m.icon_url icon_url, m.published published, m.approved approved, m.updated updated,
-            m.team_id team_id, m.license license, m.slug slug, m.status status_name, m.color color,
-            u.username username,
-            ARRAY_AGG(DISTINCT c.category) filter (where c.category is not null and mc.is_additional is false) categories,
-            ARRAY_AGG(DISTINCT c.category) filter (where c.category is not null and mc.is_additional is true) additional_categories,
-            ARRAY_AGG(DISTINCT lo.loader) filter (where lo.loader is not null) loaders,
-            ARRAY_AGG(DISTINCT pt.name) filter (where pt.name is not null) project_types,
-            ARRAY_AGG(DISTINCT g.slug) filter (where g.slug is not null) games,
-            ARRAY_AGG(DISTINCT mg.image_url) filter (where mg.image_url is not null and mg.featured is false) gallery,
-            ARRAY_AGG(DISTINCT mg.image_url) filter (where mg.image_url is not null and mg.featured is true) featured_gallery,
-            vf.version_fields_json version_fields,
-            lf.loader_fields_json loader_fields,
-            lfev.loader_field_enum_values_json loader_field_enum_values
-            FROM versions v
-            INNER JOIN mods m ON v.mod_id = m.id AND m.status = ANY($2)
-            LEFT OUTER JOIN mods_categories mc ON joining_mod_id = m.id
-            LEFT OUTER JOIN categories c ON mc.joining_category_id = c.id
-            LEFT OUTER JOIN loaders_versions lv ON lv.version_id = v.id
-            LEFT OUTER JOIN loaders lo ON lo.id = lv.loader_id
-            LEFT JOIN loaders_project_types lpt ON lpt.joining_loader_id = lo.id
-            LEFT JOIN project_types pt ON pt.id = lpt.joining_project_type_id
-            LEFT JOIN loaders_project_types_games lptg ON lptg.loader_id = lo.id AND lptg.project_type_id = pt.id
-            LEFT JOIN games g ON lptg.game_id = g.id
-            LEFT OUTER JOIN mods_gallery mg ON mg.mod_id = m.id
-            INNER JOIN team_members tm ON tm.team_id = m.team_id AND tm.is_owner = TRUE AND tm.accepted = TRUE
-            INNER JOIN users u ON tm.user_id = u.id
-            LEFT OUTER JOIN version_fields_json vf ON v.id = vf.version_id
-            LEFT OUTER JOIN loader_fields_json lf ON v.id = lf.version_id
-            LEFT OUTER JOIN loader_field_enum_values_json lfev ON v.id = lfev.version_id
-            WHERE v.status != ANY($1)
-            GROUP BY v.id, vf.version_fields_json, lf.loader_fields_json, lfev.loader_field_enum_values_json, m.id, u.id;
-            ",
-            &*crate::models::projects::VersionStatus::iterator().filter(|x| x.is_hidden()).map(|x| x.to_string()).collect::<Vec<String>>(),
-            &*crate::models::projects::ProjectStatus::iterator().filter(|x| x.is_searchable()).map(|x| x.to_string()).collect::<Vec<String>>(),
-        )
-            .fetch_many(&pool)
-            .try_filter_map(|e| {
-                let loader_field_keys = loader_field_keys.clone();
-                async move {
-                Ok(e.right().map(|m| {
-                    let mut additional_categories = m.additional_categories.unwrap_or_default();
-                    let mut categories = m.categories.unwrap_or_default();
+    let all_visible_ids: HashMap<VersionId, (ProjectId, String)> = sqlx::query!(
+        "
+        SELECT v.id id, m.id mod_id, u.username owner_username
+        
+        FROM versions v
+        INNER JOIN mods m ON v.mod_id = m.id AND m.status = ANY($2)
+        INNER JOIN team_members tm ON tm.team_id = m.team_id AND tm.is_owner = TRUE AND tm.accepted = TRUE
+        INNER JOIN users u ON tm.user_id = u.id
+        WHERE v.status != ANY($1)
+        GROUP BY v.id, m.id, u.id
+        ORDER BY m.id DESC;
+        ",
+        &*crate::models::projects::VersionStatus::iterator()
+            .filter(|x| x.is_hidden())
+            .map(|x| x.to_string())
+            .collect::<Vec<String>>(),
+        &*crate::models::projects::ProjectStatus::iterator()
+            .filter(|x| x.is_searchable())
+            .map(|x| x.to_string())
+            .collect::<Vec<String>>(),
+    )
+    .fetch_many(&pool)
+    .try_filter_map(|e| async move {
+        Ok(e.right().map(|m| {
+            let project_id: ProjectId = ProjectId(m.mod_id);
+            let version_id: VersionId = VersionId(m.id);
+            (version_id, (project_id, m.owner_username))
+        }))
+    })
+    .try_collect::<HashMap<_, _>>()
+    .await?;

-                    categories.append(&mut m.loaders.unwrap_or_default());
+    let project_ids = all_visible_ids
+        .values()
+        .map(|(project_id, _)| project_id)
+        .cloned()
+        .collect::<Vec<_>>();
+    let projects: HashMap<_, _> = project_item::Project::get_many_ids(&project_ids, &pool, redis)
+        .await?
+        .into_iter()
+        .map(|p| (p.inner.id, p))
+        .collect();

-                    let display_categories = categories.clone();
-                    categories.append(&mut additional_categories);
+    let version_ids = all_visible_ids.keys().cloned().collect::<Vec<_>>();
+    let versions: HashMap<_, _> = version_item::Version::get_many(&version_ids, &pool, redis)
+        .await?
+        .into_iter()
+        .map(|v| (v.inner.id, v))
+        .collect();

-                    let version_fields = VersionField::from_query_json(m.loader_fields, m.version_fields, m.loader_field_enum_values, false);
+    let mut uploads = Vec::new();
+    // TODO: could possibly clone less here?
+    for (version_id, (project_id, owner_username)) in all_visible_ids {
+        let m = projects.get(&project_id);
+        let v = versions.get(&version_id);

-                    let loader_fields : HashMap<String, Vec<String>> = version_fields.into_iter().map(|vf| {
-                        (vf.field_name, vf.value.as_strings())
-                    }).collect();
+        let m = match m {
+            Some(m) => m,
+            None => continue,
+        };

-                    for v in loader_fields.keys().cloned() {
-                        loader_field_keys.insert(v);
-                    }
+        let v = match v {
+            Some(v) => v,
+            None => continue,
+        };

-                    let project_id: crate::models::projects::ProjectId = ProjectId(m.id).into();
-                    let version_id: crate::models::projects::ProjectId = ProjectId(m.version_id).into();
+        let version_id: crate::models::projects::VersionId = v.inner.id.into();
+        let project_id: crate::models::projects::ProjectId = m.inner.id.into();
+        let team_id: crate::models::teams::TeamId = m.inner.team_id.into();
+        let organization_id: Option<crate::models::organizations::OrganizationId> =
+            m.inner.organization_id.map(|x| x.into());
+        let thread_id: crate::models::threads::ThreadId = m.thread_id.into();

-                    let license = match m.license.split(' ').next() {
-                        Some(license) => license.to_string(),
-                        None => m.license,
-                    };
+        let all_version_ids = m
+            .versions
+            .iter()
+            .map(|v| (*v).into())
+            .collect::<Vec<crate::models::projects::VersionId>>();

-                    let open_source = match spdx::license_id(&license) {
-                        Some(id) => id.is_osi_approved(),
-                        _ => false,
-                    };
+        let mut additional_categories = m.additional_categories.clone();
+        let mut categories = m.categories.clone();

-                    // SPECIAL BEHAVIOUR
-                    // Todo: revisit.
-                    // For consistency with v2 searching, we consider the loader field 'mrpack_loaders' to be a category.
-                    // These were previously considered the loader, and in v2, the loader is a category for searching.
-                    // So to avoid breakage or awkward conversions, we just consider those loader_fields to be categories.
-                    // The loaders are kept in loader_fields as well, so that no information is lost on retrieval.
-                    let mrpack_loaders = loader_fields.get("mrpack_loaders").cloned().unwrap_or_default();
-                    categories.extend(mrpack_loaders);
+        // Uses version loaders, not project loaders.
+        categories.append(&mut v.loaders.clone());
+
+        let display_categories = categories.clone();
+        categories.append(&mut additional_categories);
+
+        let version_fields = v.version_fields.clone();
+        let loader_fields = models::projects::from_duplicate_version_fields(version_fields);
+        for v in loader_fields.keys().cloned() {
+            loader_field_keys.insert(v);
+        }
+
+        let license = match m.inner.license.split(' ').next() {
+            Some(license) => license.to_string(),
+            None => m.inner.license.clone(),
+        };
+
+        let open_source = match spdx::license_id(&license) {
+            Some(id) => id.is_osi_approved(),
+            _ => false,
+        };
+
+        // For loaders, get ALL loaders across ALL versions
+        let mut loaders = all_version_ids
+            .iter()
+            .fold(vec![], |mut loaders, version_id| {
+                let version = versions.get(&(*version_id).into());
+                if let Some(version) = version {
+                    loaders.extend(version.loaders.clone());
+                }
+                loaders
+            });
+        loaders.sort();
+        loaders.dedup();
+
+        // SPECIAL BEHAVIOUR
+        // Todo: revisit.
+        // For consistency with v2 searching, we consider the loader field 'mrpack_loaders' to be a category.
+        // These were previously considered the loader, and in v2, the loader is a category for searching.
+        // So to avoid breakage or awkward conversions, we just consider those loader_fields to be categories.
+        // The loaders are kept in loader_fields as well, so that no information is lost on retrieval.
+        let mrpack_loaders = loader_fields
+            .get("mrpack_loaders")
+            .cloned()
+            .map(|x| {
+                x.into_iter()
+                    .filter_map(|x| x.as_str().map(String::from))
+                    .collect::<Vec<_>>()
+            })
+            .unwrap_or_default();
+        categories.extend(mrpack_loaders);
+
+        let gallery = m
+            .gallery_items
+            .iter()
+            .filter(|gi| !gi.featured)
+            .map(|gi| gi.image_url.clone())
+            .collect::<Vec<_>>();
+        let featured_gallery = m
+            .gallery_items
+            .iter()
+            .filter(|gi| gi.featured)
+            .map(|gi| gi.image_url.clone())
+            .collect::<Vec<_>>();
+        let featured_gallery = featured_gallery.first().cloned();
+
+        let usp = UploadSearchProject {
+            version_id: version_id.to_string(),
+            project_id: project_id.to_string(),
+            name: m.inner.name.clone(),
+            summary: m.inner.summary.clone(),
+            categories,
+            follows: m.inner.follows,
+            downloads: m.inner.downloads,
+            icon_url: m.inner.icon_url.clone(),
+            author: owner_username,
+            date_created: m.inner.approved.unwrap_or(m.inner.published),
+            created_timestamp: m.inner.approved.unwrap_or(m.inner.published).timestamp(),
+            date_modified: m.inner.updated,
+            modified_timestamp: m.inner.updated.timestamp(),
+            license,
+            slug: m.inner.slug.clone(),
+            project_types: m.project_types.clone(),
+            gallery,
+            featured_gallery,
+            display_categories,
+            open_source,
+            color: m.inner.color,
+            loader_fields,
+            license_url: m.inner.license_url.clone(),
+            monetization_status: Some(m.inner.monetization_status),
+            team_id: team_id.to_string(),
+            organization_id: organization_id.map(|x| x.to_string()),
+            thread_id: thread_id.to_string(),
+            versions: all_version_ids.iter().map(|x| x.to_string()).collect(),
+            date_published: m.inner.published,
+            date_queued: m.inner.queued,
+            status: m.inner.status,
+            requested_status: m.inner.requested_status,
+            games: m.games.clone(),
+            links: m.urls.clone(),
+            gallery_items: m.gallery_items.clone(),
+            loaders,
+        };
+
+        uploads.push(usp);
+    }

-                    UploadSearchProject {
-                        version_id: version_id.to_string(),
-                        project_id: project_id.to_string(),
-                        name: m.name,
-                        description: m.description,
-                        categories,
-                        follows: m.follows,
-                        downloads: m.downloads,
-                        icon_url: m.icon_url.unwrap_or_default(),
-                        author: m.username,
-                        date_created: m.approved.unwrap_or(m.published),
-                        created_timestamp: m.approved.unwrap_or(m.published).timestamp(),
-                        date_modified: m.updated,
-                        modified_timestamp: m.updated.timestamp(),
-                        license,
-                        slug: m.slug,
-                        project_types: m.project_types.unwrap_or_default(),
-                        gallery: m.gallery.unwrap_or_default(),
-                        display_categories,
-                        open_source,
-                        color: m.color.map(|x| x as u32),
-                        featured_gallery: m.featured_gallery.unwrap_or_default().first().cloned(),
-                        loader_fields
-                    }
-                }))
-}})
-            .try_collect::<Vec<_>>()
-            .await?;
    Ok((
        uploads,
        Arc::try_unwrap(loader_field_keys)
--- a/src/search/indexing/mod.rs
+++ b/src/search/indexing/mod.rs
@@ -1,6 +1,7 @@
 /// This module is used for the indexing from any source.
 pub mod local_import;

+use crate::database::redis::RedisPool;
 use crate::search::{SearchConfig, UploadSearchProject};
 use local_import::index_local;
 use meilisearch_sdk::client::Client;
@@ -30,11 +31,15 @@ pub enum IndexingError {
 // assumes a max average size of 1KiB per project to avoid this cap.
 const MEILISEARCH_CHUNK_SIZE: usize = 10000;

-pub async fn index_projects(pool: PgPool, config: &SearchConfig) -> Result<(), IndexingError> {
+pub async fn index_projects(
+    pool: PgPool,
+    redis: RedisPool,
+    config: &SearchConfig,
+) -> Result<(), IndexingError> {
    let mut docs_to_add: Vec<UploadSearchProject> = vec![];
    let mut additional_fields: Vec<String> = vec![];

-    let (mut uploads, mut loader_fields) = index_local(pool.clone()).await?;
+    let (mut uploads, mut loader_fields) = index_local(pool.clone(), &redis).await?;
    docs_to_add.append(&mut uploads);
    additional_fields.append(&mut loader_fields);

@@ -186,7 +191,7 @@ const DEFAULT_DISPLAYED_ATTRIBUTES: &[&str] = &[
    "slug",
    "author",
    "name",
-    "description",
+    "summary",
    "categories",
    "display_categories",
    "downloads",
@@ -199,9 +204,26 @@ const DEFAULT_DISPLAYED_ATTRIBUTES: &[&str] = &[
    "gallery",
    "featured_gallery",
    "color",
+    // Note: loader fields are not listed here; they are added dynamically as needed, depending on which ones exist.
+
+    // Non-searchable fields for filling out the Project model.
+    "license_url",
+    "monetization_status",
+    "team_id",
+    "thread_id",
+    "versions",
+    "date_published",
+    "date_queued",
+    "status",
+    "requested_status",
+    "games",
+    "organization_id",
+    "links",
+    "gallery_items",
+    "loaders", // Search uses loaders as categories; this field exists purely for the Project model.
 ];

-const DEFAULT_SEARCHABLE_ATTRIBUTES: &[&str] = &["name", "description", "author", "slug"];
+const DEFAULT_SEARCHABLE_ATTRIBUTES: &[&str] = &["name", "summary", "author", "slug"];

 const DEFAULT_ATTRIBUTES_FOR_FACETING: &[&str] = &[
    "categories",
--- a/src/search/mod.rs
+++ b/src/search/mod.rs
@@ -1,5 +1,6 @@
+use crate::database::models::project_item::{GalleryItem, LinkUrl};
 use crate::models::error::ApiError;
-use crate::models::projects::SearchRequest;
+use crate::models::projects::{MonetizationStatus, ProjectStatus, SearchRequest};
 use actix_web::http::StatusCode;
 use actix_web::HttpResponse;
 use chrono::{DateTime, Utc};
@@ -80,12 +81,12 @@ pub struct UploadSearchProject {
    pub slug: Option<String>,
    pub author: String,
    pub name: String,
-    pub description: String,
+    pub summary: String,
    pub categories: Vec<String>,
    pub display_categories: Vec<String>,
    pub follows: i32,
    pub downloads: i32,
-    pub icon_url: String,
+    pub icon_url: Option<String>,
    pub license: String,
    pub gallery: Vec<String>,
    pub featured_gallery: Option<String>,
@@ -100,8 +101,24 @@ pub struct UploadSearchProject {
    pub open_source: bool,
    pub color: Option<u32>,

+    // Hidden fields to get the Project model out of the search results.
+    pub license_url: Option<String>,
+    pub monetization_status: Option<MonetizationStatus>,
+    pub team_id: String,
+    pub thread_id: String,
+    pub versions: Vec<String>,
+    pub date_published: DateTime<Utc>,
+    pub date_queued: Option<DateTime<Utc>>,
+    pub status: ProjectStatus,
+    pub requested_status: Option<ProjectStatus>,
+    pub loaders: Vec<String>, // Search uses loaders as categories; this field exists purely for the Project model.
+    pub links: Vec<LinkUrl>,
+    pub gallery_items: Vec<GalleryItem>, // Only the image URLs are stored in `gallery`; the full gallery items are stored here, as required for the Project model.
+    pub games: Vec<String>,              // Todo: in future, could be a searchable field.
+    pub organization_id: Option<String>, // Todo: in future, could be a searchable field.
+
    #[serde(flatten)]
-    pub loader_fields: HashMap<String, Vec<String>>,
+    pub loader_fields: HashMap<String, Vec<serde_json::Value>>,
 }

 #[derive(Serialize, Deserialize, Debug)]
@@ -120,12 +137,12 @@ pub struct ResultSearchProject {
    pub slug: Option<String>,
    pub author: String,
    pub name: String,
-    pub description: String,
+    pub summary: String,
    pub categories: Vec<String>,
    pub display_categories: Vec<String>,
    pub downloads: i32,
    pub follows: i32,
-    pub icon_url: String,
+    pub icon_url: Option<String>,
    /// RFC 3339 formatted creation date of the project
    pub date_created: String,
    /// RFC 3339 formatted modification date of the project
@@ -135,8 +152,24 @@ pub struct ResultSearchProject {
    pub featured_gallery: Option<String>,
    pub color: Option<u32>,

+    // Hidden fields to get the Project model out of the search results.
+    pub license_url: Option<String>,
+    pub monetization_status: Option<String>,
+    pub team_id: String,
+    pub thread_id: String,
+    pub versions: Vec<String>,
+    pub date_published: String,
+    pub date_queued: Option<String>,
+    pub status: String,
+    pub requested_status: Option<String>,
+    pub loaders: Vec<String>, // Search uses loaders as categories; this field exists purely for the Project model.
+    pub links: Vec<LinkUrl>,
+    pub games: Vec<String>, // Todo: in future, could be a searchable field.
+    pub gallery_items: Vec<GalleryItem>, // Only the image URLs are stored in `gallery`; the full gallery items are stored here, as required for the Project model.
+    pub organization_id: Option<String>, // Todo: in future, could be a searchable field.
+
    #[serde(flatten)]
-    pub loader_fields: HashMap<String, Vec<String>>,
+    pub loader_fields: HashMap<String, Vec<serde_json::Value>>,
 }

 pub fn get_sort_index(index: &str) -> Result<(&str, [&str; 1]), SearchError> {