From 03bf78b2a3113d20bae5c45501da1cf2093bfa81 Mon Sep 17 00:00:00 2001 From: Kubat <maelle.martin@proton.me> Date: Fri, 18 Oct 2024 18:35:07 +0200 Subject: [PATCH] SEARCH: Implement the base of matching karas --- Cargo.lock | 22 +++- Cargo.toml | 2 + amadeus/src/app/pages/search.rs | 4 +- kurisu_api/Cargo.toml | 25 ++-- kurisu_api/src/v2.rs | 9 +- lektor_nkdb/src/database/kara.rs | 9 -- lektor_nkdb/src/id.rs | 8 +- lektor_nkdb/src/lib.rs | 9 +- lektor_nkdb/src/search/kara_by.rs | 144 --------------------- lektor_nkdb/src/search/mod.rs | 83 ------------ lektor_payloads/src/filter.rs | 16 +++ lektor_payloads/src/lib.rs | 8 +- lektor_payloads/src/search.rs | 145 +++++++++++++++++++-- lektor_search/Cargo.toml | 16 +++ lektor_search/src/batch.rs | 204 ++++++++++++++++++++++++++++++ lektor_search/src/lib.rs | 45 +++++++ lektor_search/src/search.rs | 194 ++++++++++++++++++++++++++++ lektor_search/src/traits.rs | 37 ++++++ lektord/Cargo.toml | 3 +- 19 files changed, 701 insertions(+), 282 deletions(-) delete mode 100644 lektor_nkdb/src/search/kara_by.rs delete mode 100644 lektor_nkdb/src/search/mod.rs create mode 100644 lektor_payloads/src/filter.rs create mode 100644 lektor_search/Cargo.toml create mode 100644 lektor_search/src/batch.rs create mode 100644 lektor_search/src/lib.rs create mode 100644 lektor_search/src/search.rs create mode 100644 lektor_search/src/traits.rs diff --git a/Cargo.lock b/Cargo.lock index 2055200c..16b7f847 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -812,9 +812,9 @@ checksum = "64fa3c856b712db6612c019f14756e64e4bcea13337a6b33b696333a9eaa2d06" [[package]] name = "bytemuck" -version = "1.18.0" +version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94bbb0ad554ad961ddc5da507a12a29b14e4ae5bda06b19f575a3e6079d2e2ae" +checksum = "8334215b81e418a0a7bdb8ef0849474f40bb10c8b71f1c4ed315cff49f32494d" dependencies = [ "bytemuck_derive", ] @@ -2297,6 +2297,7 @@ checksum = 
"e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" dependencies = [ "ahash", "allocator-api2", + "serde", ] [[package]] @@ -3018,6 +3019,7 @@ version = "8.0.1" dependencies = [ "derive_more", "hashbrown 0.15.0", + "lektor_procmacros", "lektor_utils", "log", "serde", @@ -3131,6 +3133,17 @@ dependencies = [ "tokio", ] +[[package]] +name = "lektor_search" +version = "8.0.1" +dependencies = [ + "aho-corasick", + "futures", + "hashbrown 0.14.5", + "lektor_payloads", + "log", +] + [[package]] name = "lektor_utils" version = "8.0.1" @@ -3162,6 +3175,7 @@ dependencies = [ "lektor_nkdb", "lektor_payloads", "lektor_repo", + "lektor_search", "lektor_utils", "log", "rand", @@ -4176,9 +4190,9 @@ dependencies = [ [[package]] name = "profiling" -version = "1.0.15" +version = "1.0.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43d84d1d7a6ac92673717f9f6d1518374ef257669c24ebc5ac25d5033828be58" +checksum = "afbdc74edc00b6f6a218ca6a5364d6226a259d4b8ea1af4a0ea063f27e179f4d" [[package]] name = "qoi" diff --git a/Cargo.toml b/Cargo.toml index 638ca35a..1e3e7846 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -64,10 +64,12 @@ lektor_mpris = { path = "lektor_mpris" } lektor_payloads = { path = "lektor_payloads" } lektor_procmacros = { path = "lektor_procmacros" } lektor_nkdb = { path = "lektor_nkdb" } +lektor_search = { path = "lektor_search" } # Data Structures hashbrown = { version = "*", features = ["serde"] } async-channel = { version = "*", default-features = false } +aho-corasick = { version = "*" } # Serialization & Deserialization toml = "*" diff --git a/amadeus/src/app/pages/search.rs b/amadeus/src/app/pages/search.rs index b427ed70..86d1f49a 100644 --- a/amadeus/src/app/pages/search.rs +++ b/amadeus/src/app/pages/search.rs @@ -48,8 +48,8 @@ impl FilterAtom { let (icon, text) = match &self.1 { KaraBy::Id(id) => (icon!(HASHTAG), id.to_string()), KaraBy::Query(query) => (icon!(FILTER), query.clone()), - KaraBy::Tag((name, None)) => 
(icon!(TAG), name.clone()), - KaraBy::Tag((name, Some(value))) => (icon!(TAGS), format!("{name}:{value}")), + KaraBy::Tag(name, None) => (icon!(TAG), name.clone()), + KaraBy::Tag(name, Some(value)) => (icon!(TAGS), format!("{name}:{value}")), KaraBy::SongType(song_type) => (icon!(HASHTAG), song_type.to_string()), KaraBy::SongOrigin(song_origin) => (icon!(HASHTAG), song_origin.to_string()), KaraBy::Author(author) => (icon!(USER), author.clone()), diff --git a/kurisu_api/Cargo.toml b/kurisu_api/Cargo.toml index 5c6996e5..a962735c 100644 --- a/kurisu_api/Cargo.toml +++ b/kurisu_api/Cargo.toml @@ -1,23 +1,24 @@ [package] -name = "kurisu_api" +name = "kurisu_api" description = "Crate used to deserialize what Kurisu returns" -rust-version.workspace = true -version.workspace = true -edition.workspace = true -authors.workspace = true -license.workspace = true +rust-version.workspace = true +version.workspace = true +edition.workspace = true +authors.workspace = true +license.workspace = true [lib] doctest = false [dependencies] -log.workspace = true -serde.workspace = true -sha256.workspace = true -hashbrown.workspace = true -derive_more.workspace = true -lektor_utils = { path = "../lektor_utils" } +log.workspace = true +serde.workspace = true +sha256.workspace = true +hashbrown.workspace = true +derive_more.workspace = true +lektor_utils.workspace = true +lektor_procmacros.workspace = true [dev-dependencies] serde_json.workspace = true diff --git a/kurisu_api/src/v2.rs b/kurisu_api/src/v2.rs index 35c9cae8..6fd2504b 100644 --- a/kurisu_api/src/v2.rs +++ b/kurisu_api/src/v2.rs @@ -3,6 +3,7 @@ use crate::{error::Error, SHA256}; use derive_more::Display; use hashbrown::{HashMap, HashSet}; +use lektor_procmacros::EnumVariantCount; use serde::{Deserialize, Serialize}; use std::{borrow, cmp, collections::BTreeSet, str::FromStr}; @@ -155,7 +156,9 @@ impl Infos { } /// The type of a song. One the the following, one per kara. 
-#[derive(Debug, PartialEq, Eq, Serialize, Deserialize, Clone, Copy, Hash, Display)] +#[derive( + Debug, PartialEq, Eq, Serialize, Deserialize, Clone, Copy, Hash, Display, EnumVariantCount, +)] #[serde(rename_all = "UPPERCASE")] #[display("{}", self.as_str())] pub enum SongType { @@ -167,7 +170,9 @@ pub enum SongType { } /// The origin of a song's source. One the the following, one per kara. -#[derive(Debug, PartialEq, Eq, Serialize, Deserialize, Clone, Copy, Hash, Display)] +#[derive( + Debug, PartialEq, Eq, Serialize, Deserialize, Clone, Copy, Hash, Display, EnumVariantCount, +)] #[serde(rename_all = "lowercase")] #[display("{}", self.as_str())] pub enum SongOrigin { diff --git a/lektor_nkdb/src/database/kara.rs b/lektor_nkdb/src/database/kara.rs index aeab4641..31747df3 100644 --- a/lektor_nkdb/src/database/kara.rs +++ b/lektor_nkdb/src/database/kara.rs @@ -99,15 +99,6 @@ impl Kara { ) } - /// Get the source/title string to use for regex match. - pub(crate) fn to_title_string(&self) -> String { - let mut ret = String::with_capacity(self.song_title.len() + self.song_source.len() + 3); - ret.push_str(&self.song_source.to_lowercase()); - ret.push_str(" / "); - ret.push_str(&self.song_title.to_lowercase()); - ret - } - pub const TAG_NUMBER: &str = "number"; pub const TAG_VERSION: &str = "version"; } diff --git a/lektor_nkdb/src/id.rs b/lektor_nkdb/src/id.rs index 0dea9e69..3fdc8a92 100644 --- a/lektor_nkdb/src/id.rs +++ b/lektor_nkdb/src/id.rs @@ -13,6 +13,12 @@ use std::{borrow, num, str::FromStr, sync::Arc}; #[display("{_0}")] pub struct KId(pub(crate) u64); +impl KId { + pub const fn from_u64(id: u64) -> Self { + Self(id) + } +} + impl PartialEq<KId> for u64 { fn eq(&self, other: &KId) -> bool { other.0 == *self @@ -35,7 +41,7 @@ impl FromStr for KId { impl From<u64> for KId { fn from(value: u64) -> Self { - Self(value) + Self::from_u64(value) } } diff --git a/lektor_nkdb/src/lib.rs b/lektor_nkdb/src/lib.rs index f9e94341..80e25d79 100644 --- 
a/lektor_nkdb/src/lib.rs +++ b/lektor_nkdb/src/lib.rs @@ -8,15 +8,11 @@ pub use crate::{ }, id::{KId, RemoteKId}, playlists::playlist::{Playlist, PlaylistInfo}, - search::{KaraBy, SearchFrom}, storage::{DatabaseDiskStorage, DatabaseStorage}, }; -pub use kurisu_api::v2::{SongOrigin, SongType}; +pub use kurisu_api::v2::{SongOrigin, SongType, SONGORIGIN_LENGTH, SONGTYPE_LENGTH}; -use crate::{ - database::{epoch::EpochData, pool::Pool}, - search::*, -}; +use crate::database::{epoch::EpochData, pool::Pool}; use anyhow::{anyhow, Context as _, Result}; use hashbrown::HashMap; use lektor_utils::pushvec::*; @@ -25,7 +21,6 @@ use playlists::{Playlists, PlaylistsHandle}; mod database; mod id; mod playlists; -mod search; mod storage; mod strings; diff --git a/lektor_nkdb/src/search/kara_by.rs b/lektor_nkdb/src/search/kara_by.rs deleted file mode 100644 index ae375cbc..00000000 --- a/lektor_nkdb/src/search/kara_by.rs +++ /dev/null @@ -1,144 +0,0 @@ -use crate::*; -use lektor_utils::either; -use regex::{Regex, RegexBuilder}; -use serde::{Deserialize, Serialize}; -use std::{borrow::Cow, convert::Infallible, fmt, str::FromStr}; - -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)] -pub enum KaraBy { - Id(u64), - Query(String), - Tag((String, Option<String>)), - SongType(SongType), - SongOrigin(SongOrigin), - Author(String), - Playlist(String), -} - -/// Get the index if the character is not an alphanumeric or space one. -fn non_alphanumspace_char((i, c): (usize, char)) -> Option<usize> { - (!(c.is_ascii_digit() || c.is_alphanumeric() || c.is_whitespace())).then_some(i) -} - -/// Trim a string in-place. 
-fn trim_in_place(value: &mut String) { - const SPACE: &[char] = &[' ', '\t', '\r', '\n']; - while value.starts_with(SPACE) { - value.remove(0); - } - while value.ends_with(SPACE) { - value.pop(); - } -} - -fn build_regex_for_cow(value: Cow<'_, str>) -> Result<Regex> { - let mut fuzzy = value.trim().replace(' ', r".+").to_lowercase(); - fuzzy.insert_str(0, r".*"); - fuzzy.push_str(r".*"); - Ok(RegexBuilder::new(&fuzzy) - .nest_limit(32) - .swap_greed(true) - .case_insensitive(false) - .build()?) -} - -impl fmt::Display for KaraBy { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let str: Cow<str> = self.into(); - f.write_str(&str) - } -} - -impl<'a> From<&'a KaraBy> for Cow<'a, str> { - fn from(value: &'a KaraBy) -> Self { - match value { - KaraBy::Id(id) => Cow::Owned(id.to_string()), - KaraBy::Tag((tag, Some(value))) => Cow::Owned(format!("{tag}:{value}")), - KaraBy::Tag((tag, None)) => Cow::Borrowed(tag.as_str()), - KaraBy::SongType(ty) => Cow::Borrowed(ty.as_str()), - KaraBy::SongOrigin(ori) => Cow::Borrowed(ori.as_str()), - - KaraBy::Author(str) | KaraBy::Playlist(str) | KaraBy::Query(str) => { - Cow::Borrowed(str.as_str()) - } - } - } -} - -impl From<String> for KaraBy { - fn from(mut value: String) -> Self { - trim_in_place(&mut value); - - if value.starts_with('@') { - value.remove(0); - trim_in_place(&mut value); - Self::Author(value) - } else if value.starts_with('#') { - value.remove(0); - trim_in_place(&mut value); - Self::Playlist(value) - } else if let Ok(value) = value.parse::<u64>() { - Self::Id(value) - } else if let Ok(value) = value.parse::<SongType>() { - Self::SongType(value) - } else if let Ok(value) = value.parse::<SongOrigin>() { - Self::SongOrigin(value) - } else if let Some((tag, value)) = value.split_once(':') { - if tag.is_empty() { - Self::Tag((value.trim().to_string(), None)) - } else { - let value = value.trim(); - let value = either!(value.is_empty() => None; Some(value.to_string())); - 
Self::Tag((tag.trim().to_string(), value)) - } - } else { - Self::Query(value) - } - } -} - -impl FromStr for KaraBy { - type Err = Infallible; - - fn from_str(value: &str) -> Result<Self, Self::Err> { - Ok(value.trim().to_string().into()) - } -} - -impl TryFrom<KaraBy> for Regex { - type Error = anyhow::Error; - - fn try_from(value: KaraBy) -> std::result::Result<Self, Self::Error> { - use KaraBy::*; - match &value { - Id(_) | SongType(_) | SongOrigin(_) => {} - regex @ Author(_) | regex @ Playlist(_) | regex @ Tag(_) | regex @ Query(_) => { - if let Some(idx) = Into::<Cow<'_, _>>::into(regex) - .char_indices() - .find_map(non_alphanumspace_char) - { - let regex = regex.to_string(); - anyhow::bail!("invalid char at index {idx} in regex: {regex}") - } - } - } - - build_regex_for_cow(Into::<Cow<str>>::into(&value)) - } -} - -impl TryFrom<KaraBy> for SearchBy { - type Error = anyhow::Error; - - fn try_from(value: KaraBy) -> std::result::Result<Self, Self::Error> { - match value { - KaraBy::Query(query) => build_regex_for_cow(Cow::Owned(query)).map(SearchBy::Query), - KaraBy::Id(id) => Ok(SearchBy::Id(id)), - KaraBy::Tag((tag, value)) => Ok(SearchBy::Tag((tag, value))), - KaraBy::SongType(ty) => Ok(SearchBy::SongType(ty)), - KaraBy::SongOrigin(ori) => Ok(SearchBy::SongOrigin(ori)), - KaraBy::Author(auth) => Ok(SearchBy::Author(auth)), - KaraBy::Playlist(plt) => Ok(SearchBy::Playlist(plt)), - } - } -} diff --git a/lektor_nkdb/src/search/mod.rs b/lektor_nkdb/src/search/mod.rs deleted file mode 100644 index 6c7b416b..00000000 --- a/lektor_nkdb/src/search/mod.rs +++ /dev/null @@ -1,83 +0,0 @@ -//! Utilities to search the database, the playlists, the history, the queue, etc, in a single -//! consistent way. - -mod kara_by; - -pub use kara_by::*; - -use crate::{KId, Kara}; -use anyhow::Result; -use kurisu_api::v2::{SongOrigin, SongType}; -use regex::Regex; -use serde::{Deserialize, Serialize}; - -/// Structure to tell from which KId set we are searching. 
-#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq)] -pub enum SearchFrom { - Queue, - Database, - History, - Playlist(KId), -} - -/// Structure used to tell how to do the search, either by a regex, or by applying another way -/// (author, tag, etc), or a list (intersection) of multiple things. -#[derive(Debug, Clone)] -pub enum SearchBy { - Query(Regex), - Id(u64), - Tag((String, Option<String>)), - SongType(SongType), - SongOrigin(SongOrigin), - Author(String), - Playlist(String), - Multiple(Vec<SearchBy>), -} - -impl FromIterator<SearchBy> for SearchBy { - fn from_iter<T: IntoIterator<Item = SearchBy>>(iter: T) -> Self { - SearchBy::Multiple(iter.into_iter().collect()) - } -} - -impl SearchBy { - pub(crate) fn new(regex: KaraBy) -> Result<Self> { - regex.try_into() - } - - /// Get the list of playlist that are needed for the kara to match. This is the only - /// informations that is not present in the epoch and thus need to be handled differently... - pub(crate) fn into_needed_playlists(self) -> Vec<String> { - match self { - SearchBy::Playlist(plt) => vec![plt], - SearchBy::Multiple(searches) => searches - .into_iter() - .flat_map(Self::into_needed_playlists) - .collect(), - _ => vec![], - } - } - - /// A match function. - pub(crate) fn matches(&self, kara: &Kara) -> bool { - match &self { - SearchBy::Query(regex) => regex.is_match(&kara.to_title_string()), - SearchBy::Id(id) => kara.id == *id, - SearchBy::SongType(ty) => kara.song_type.eq(ty), - SearchBy::SongOrigin(ori) => kara.song_origin.eq(ori), - SearchBy::Author(author) => kara.kara_makers.contains(author.as_str()), - SearchBy::Tag((key, None)) => kara.tags.contains_key(key.as_str()), - SearchBy::Tag((key, Some(value))) => kara - .tags - .get(key.as_str()) - .map(|v| v.iter().any(|v| v.as_ref().eq(value.as_str()))) - .unwrap_or_default(), - - // Recursive thing to apply multiple filters. - SearchBy::Multiple(filters) => filters.iter().all(|filter| filter.matches(kara)), - - // Handled after... 
- SearchBy::Playlist(_) => true, - } - } -} diff --git a/lektor_payloads/src/filter.rs b/lektor_payloads/src/filter.rs new file mode 100644 index 00000000..3d91d343 --- /dev/null +++ b/lektor_payloads/src/filter.rs @@ -0,0 +1,16 @@ +use lektor_nkdb::KId; +use serde::{Deserialize, Serialize}; + +/// Add to something (playlist/queue/...), or remove something. Some times we can decide to shuffle +/// the set of kara/the playlist before adding it. For the removing the shuffle flag is ignored. +#[derive(Debug, Serialize, Deserialize)] +pub enum KaraFilter { + /// A single kara. + KId(KId), + + /// A set of karas. + List(bool, Vec<KId>), + + /// The content of a playlist. + Playlist(bool, KId), +} diff --git a/lektor_payloads/src/lib.rs b/lektor_payloads/src/lib.rs index 8825f98a..56cf7eaa 100644 --- a/lektor_payloads/src/lib.rs +++ b/lektor_payloads/src/lib.rs @@ -1,6 +1,7 @@ //! Crate containing structs/enums that are used as payloads to communicate with the lektord //! daemon. Some things are re-exports. 
+mod filter; mod play_state; mod priority; mod range; @@ -8,15 +9,16 @@ mod search; mod userid; pub use crate::{ + filter::*, + play_state::*, priority::{Priority, PRIORITY_LENGTH, PRIORITY_VALUES}, - play_state::PlayState, range::*, search::*, userid::LektorUser, }; pub use lektor_nkdb::{ - KId, Kara, KaraBy, KaraStatus, KaraTimeStamps, Playlist, PlaylistInfo, RemoteKId, SearchFrom, - SongOrigin, SongType, + KId, Kara, KaraStatus, KaraTimeStamps, Playlist, PlaylistInfo, RemoteKId, SongOrigin, SongType, + SONGORIGIN_LENGTH, SONGTYPE_LENGTH, }; use anyhow::{anyhow, ensure}; diff --git a/lektor_payloads/src/search.rs b/lektor_payloads/src/search.rs index 4468c995..11b4d485 100644 --- a/lektor_payloads/src/search.rs +++ b/lektor_payloads/src/search.rs @@ -1,5 +1,50 @@ -use crate::*; +use anyhow::Result; +use lektor_nkdb::{KId, SongOrigin, SongType}; use serde::{Deserialize, Serialize}; +use std::{borrow::Cow, convert::Infallible, fmt, str::FromStr}; + +/// Structure used to tell how to do the search, either by a regex, or by applying another way +/// (author, tag, etc), or a list (intersection) of multiple things. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)] +pub enum KaraBy { + /// A kara with a particular Id. + Id(u64), + + /// Query by strings. + Query(String), + + /// A tag that contains or not a specified string. If the second element is [Some], one of the + /// values of the tag must contains the string, if [None] then the tag must just be present. + Tag(String, Option<String>), + + /// Karas with a specific [SongType] + SongType(SongType), + + /// Karas with a specific [SongOrigin] + SongOrigin(SongOrigin), + + /// Karas made by a specific author. + Author(String), + + /// Karas that are contained in a specific playlist, by its name. + Playlist(String), +} + +/// Structure to tell from which KId set we are searching. +#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq)] +pub enum SearchFrom { + /// Search from the queue. 
+ Queue, + + /// Search from the whole database. + Database, + + /// Search from the history. + History, + + /// Search from a specific playlist. + Playlist(KId), +} #[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)] pub struct SearchData { @@ -7,18 +52,91 @@ pub struct SearchData { pub regex: Vec<KaraBy>, } -/// Add to something (playlist/queue/...), or remove something. Some times we can decide to shuffle -/// the set of kara/the playlist before adding it. For the removing the shuffle flag is ignored. -#[derive(Debug, Serialize, Deserialize)] -pub enum KaraFilter { - /// A single kara. - KId(KId), +/// Trim a string and return it. +fn take_and_trim(mut value: String) -> String { + trim_in_place(&mut value); + value +} + +/// Trim a string in-place. +fn trim_in_place(value: &mut String) { + const SPACE: &[char] = &[' ', '\t', '\r', '\n']; + while value.starts_with(SPACE) { + value.remove(0); + } + while value.ends_with(SPACE) { + value.pop(); + } +} + +impl fmt::Display for KaraBy { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(Cow::from(self).as_ref()) + } +} + +impl<'a> From<&'a KaraBy> for Cow<'a, str> { + fn from(value: &'a KaraBy) -> Self { + match value { + KaraBy::Id(id) => Cow::Owned(id.to_string()), + + KaraBy::Tag(tag, Some(value)) => Cow::Owned(format!("{tag}:{value}")), + KaraBy::Tag(tag, None) => Cow::Borrowed(tag.as_str()), + + KaraBy::SongType(ty) => Cow::Borrowed(ty.as_str()), + KaraBy::SongOrigin(ori) => Cow::Borrowed(ori.as_str()), + KaraBy::Author(str) | KaraBy::Playlist(str) | KaraBy::Query(str) => { + Cow::Borrowed(str.as_str()) + } + } + } +} + +impl From<String> for KaraBy { + fn from(mut value: String) -> Self { + trim_in_place(&mut value); + + if value.starts_with('@') { + value.remove(0); + return Self::Author(take_and_trim(value)); + } - /// A set of karas. 
- List(bool, Vec<KId>), + if value.starts_with('#') { + value.remove(0); + return Self::Playlist(take_and_trim(value)); + } - /// The content of a playlist. - Playlist(bool, KId), + if let Ok(value) = value.parse::<u64>() { + return Self::Id(value); + } + + if let Ok(value) = value.parse::<SongType>() { + return Self::SongType(value); + } + + if let Ok(value) = value.parse::<SongOrigin>() { + return Self::SongOrigin(value); + } + + let Some(idx) = value.find(':') else { + return Self::Query(value); + }; + + let tag = take_and_trim(value.split_off(idx)); + let value = take_and_trim(value); + match tag.is_empty() { + true => Self::Tag(value, None), + false => Self::Tag(tag, (!value.trim().is_empty()).then_some(value)), + } + } +} + +impl FromStr for KaraBy { + type Err = Infallible; + + fn from_str(value: &str) -> Result<Self, Self::Err> { + Ok(value.trim().to_string().into()) + } } #[cfg(test)] @@ -33,8 +151,7 @@ mod test { fn assert_serde<T: Serialize + for<'de> Deserialize<'de> + std::fmt::Debug + std::cmp::Eq>( obj: T, ) -> Result<()> { - let res = serde_json::from_str(&serde_json::to_string(&obj)?)?; - assert_eq!(obj, res); + assert_eq!(obj, serde_json::from_str(&serde_json::to_string(&obj)?)?); Ok(()) } @@ -45,7 +162,7 @@ mod test { regex: vec![KaraBy::Id(42)], })?; assert_serde(SearchData { - from: SearchFrom::Playlist("jibun".parse().unwrap()), + from: SearchFrom::Playlist("1".parse().unwrap()), regex: vec![KaraBy::Query("Chicka".to_string())], })?; diff --git a/lektor_search/Cargo.toml b/lektor_search/Cargo.toml new file mode 100644 index 00000000..153ae2af --- /dev/null +++ b/lektor_search/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "lektor_search" +description = "Search traits, functions, utilities, to search the database and the queue for matching karas." 
+ +rust-version.workspace = true +version.workspace = true +edition.workspace = true +authors.workspace = true +license.workspace = true + +[dependencies] +lektor_payloads.workspace = true +futures.workspace = true +hashbrown.workspace = true +aho-corasick.workspace = true +log.workspace = true diff --git a/lektor_search/src/batch.rs b/lektor_search/src/batch.rs new file mode 100644 index 00000000..bfc7a10b --- /dev/null +++ b/lektor_search/src/batch.rs @@ -0,0 +1,204 @@ +use std::mem::MaybeUninit; + +/// A batch of elements containing at most [Batch::SIZE] elements. It implements [Iterator], so you +/// can pull elements out of the batch to proccess them. +/// +/// The [Batch::Item] must be [Copy], so that the manipulation is easier, we don't need to hand the +/// drop thing, etc. +#[derive(Clone, Copy)] +pub struct Batch<const SIZE: usize, Item: Copy> { + /// The content of the batch. Items after the `current + count` position won't be initialized. + content: [MaybeUninit<Item>; SIZE], + + /// The base index in content. + current: usize, + + /// The number of KIds in the batch. Note that len is always inferior to the const + /// SIZE parameter. + count: usize, +} + +impl<const SIZE: usize, Item: Copy> Batch<SIZE, Item> { + /// Create a new batch from a suze and a content. + /// + /// # Safety + /// The passed size must correspond to the number of initialized values at the begin of the + /// passed array. + const unsafe fn new(count: usize, content: [MaybeUninit<Item>; SIZE]) -> Self { + Self { + current: 0, + content, + count, + } + } + + /// Get the capacity of the batch. + pub const fn capacity() -> usize { + SIZE + } + + /// Create a complete batch out of an array of items. + pub const fn from_array(value: [Item; SIZE]) -> Self { + let mut content = [MaybeUninit::<Item>::uninit(); SIZE]; + let mut i = 0; + while i < SIZE { + content[i] = MaybeUninit::new(value[i]); + i += 1; + } + unsafe { Self::new(SIZE, content) } + } + + /// Create a batch out of items. 
The batch won't be complete. + pub const fn from_array_maybe(value: [Option<Item>; SIZE]) -> Self { + let mut i: usize = 0; + let mut count: usize = 0; + while i < SIZE { + count += match value[i] { + Some(_) => 1, + None => 0, + }; + i += 1; + } + + let mut i: usize = 0; + let mut content = [MaybeUninit::<Item>::uninit(); SIZE]; + while i < count { + content[i] = match value[i] { + Some(id) => MaybeUninit::new(id), + None => unreachable!(), + }; + i += 1; + } + + unsafe { Self::new(count, content) } + } + + /// Move things out of the batch and returns an array of the items. + pub fn into_array(self) -> [Option<Item>; SIZE] { + let mut ret: [Option<Item>; SIZE] = [None; SIZE]; + (self.content.into_iter()) + .skip(self.current) + .take(self.count) + .enumerate() + .for_each(|(idx, id)| ret[idx] = Some(unsafe { id.assume_init() })); + ret + } +} + +impl<const SIZE: usize, Item: Copy> From<[Option<Item>; SIZE]> for Batch<SIZE, Item> { + fn from(value: [Option<Item>; SIZE]) -> Self { + Self::from_array_maybe(value) + } +} + +impl<const SIZE: usize, Item: Copy> From<[Item; SIZE]> for Batch<SIZE, Item> { + fn from(value: [Item; SIZE]) -> Self { + Self::from_array(value) + } +} + +impl<const SIZE: usize, Item: Copy> Iterator for Batch<SIZE, Item> { + type Item = Item; + + fn next(&mut self) -> Option<Self::Item> { + debug_assert!(self.count + self.current <= SIZE, "invalid size..."); + (self.count != 0).then(|| { + let ret = self.content[self.current]; + self.current += 1; + self.count -= 1; + unsafe { ret.assume_init() } + }) + } + + fn nth(&mut self, n: usize) -> Option<Self::Item> { + debug_assert!(self.count + self.current <= SIZE, "invalid size..."); + (self.count != 0 && n < self.count).then(|| { + let ret = self.content[n + self.current]; + self.current += n + 1; + self.count -= n + 1; + unsafe { ret.assume_init() } + }) + } + + fn last(self) -> Option<Self::Item> { + debug_assert!(self.count + self.current <= SIZE, "invalid size..."); + (self.count != 0).then(|| { 
+ let ret = self.content[self.current + self.count - 1]; + unsafe { ret.assume_init() } + }) + } + + fn count(self) -> usize { + debug_assert!(self.count + self.current <= SIZE, "invalid size..."); + self.count + } + + fn size_hint(&self) -> (usize, Option<usize>) { + debug_assert!(self.count + self.current <= SIZE, "invalid size..."); + (self.count, Some(self.count)) + } +} + +#[test] +#[allow(clippy::iter_nth_zero)] +fn test_batch() { + const BATCH_01: Batch<10, u64> = Batch::from_array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]); + + assert_eq!(BATCH_01.last(), Some(9)); + assert_eq!(BATCH_01.clone().nth(9), Some(9)); + assert_eq!(BATCH_01.clone().nth(10), None); + + let mut batch = BATCH_01; + assert_eq!(batch.size_hint().0, 10); + assert_eq!(batch.size_hint().1, Some(10)); + assert_eq!(batch.count(), 10); + assert_eq!(batch.nth(9), Some(9)); + assert_eq!(batch.count(), 0); + assert_eq!(batch.nth(0), None); + assert_eq!(batch.last(), None); + + let mut batch = BATCH_01; + assert_eq!(batch.nth(0), Some(0)); + assert_eq!(batch.nth(0), Some(1)); + assert_eq!(batch.nth(0), Some(2)); + assert_eq!(batch.nth(0), Some(3)); + assert_eq!(batch.count(), 6); + assert_eq!(batch.last(), Some(9)); + + let mut batch = BATCH_01.enumerate(); + for (i, id) in batch.by_ref() { + assert_eq!(i as u64, id); + assert!(i < 10); + } + assert_eq!(batch.next(), None); + + const BATCH_02: Batch<10, u64> = Batch::from_array_maybe([ + Some(0), + None, + None, + None, + None, + None, + None, + None, + None, + None, + ]); + + assert_eq!(BATCH_02.count(), 1); + assert_eq!( + BATCH_02.into_array(), + [ + Some(0), + None, + None, + None, + None, + None, + None, + None, + None, + None, + ] + ) +} diff --git a/lektor_search/src/lib.rs b/lektor_search/src/lib.rs new file mode 100644 index 00000000..5c7878a4 --- /dev/null +++ b/lektor_search/src/lib.rs @@ -0,0 +1,45 @@ +mod batch; +mod search; +mod traits; + +use futures::{prelude::*, stream::FuturesUnordered}; +use lektor_payloads::{Kara, KaraBy}; + +pub use 
crate::{batch::*, traits::*}; + +/// Search a subset of a database that implements [KaraStore]. The subset must implement +/// [KaraIdExtractor]. The search is performed by doing the intersection of all the [KaraBy]. +pub async fn search<const BATCH_SIZE: usize>( + store: &impl KaraStore, + extractor: impl KaraIdExtractor, + search: Vec<KaraBy>, +) -> Vec<&Kara> { + let Some(search) = search::Search::new(search) else { + return Default::default(); + }; + + stream::unfold(extractor, |state| async move { + if state.is_empty().await { + return None; + } + Some((state.next_id_batch::<BATCH_SIZE>().await, state)) // Get the chuncks of IDs + }) + // + // Get the karas out of the ids + // + .then(|ids| store.get_kara_batch(ids)) + .map(Batch::into_array) + // + // Filter karas only if they are matched by the search + // + .map(|karas| karas.map(|maybe| maybe.and_then(|kara| search.matches_and_map(kara)))) + // + // Await the thing, build the return vector. + // + .collect::<FuturesUnordered<_>>() + .await + .into_iter() + .flatten() + .flatten() + .collect() +} diff --git a/lektor_search/src/search.rs b/lektor_search/src/search.rs new file mode 100644 index 00000000..9f8a8d0d --- /dev/null +++ b/lektor_search/src/search.rs @@ -0,0 +1,194 @@ +use aho_corasick::{AhoCorasick, AhoCorasickBuilder}; +use hashbrown::{HashMap, HashSet}; +use lektor_payloads::{ + KId, Kara, KaraBy, SongOrigin, SongType, SONGORIGIN_LENGTH, SONGTYPE_LENGTH, +}; + +/// To see if a kara matches. 
+/// Precomputed search filters: a kara matches only when it satisfies every configured filter.
+#[derive(Default)]
+pub(crate) struct Search {
+    /// When set, only the kara with this id can match.
+    only_id: Option<KId>,
+    /// Requested tags, each with the values requested for it (empty when only the tag itself
+    /// was asked for). Matching on tags is not implemented yet.
+    tag_has_value: HashMap<String, Vec<String>>,
+    /// Lowercased author names, the kara must be made by at least one of them.
+    made_by_authors: Vec<String>,
+    /// Requested playlists. Matching on playlists is not implemented yet.
+    present_in_playlists: Vec<String>,
+    /// Automaton built from the textual queries, [None] when no query was given.
+    queries: Option<AhoCorasick>,
+    /// Accepted song types, filled from the front; all [None] means no filter on the type.
+    types: [Option<SongType>; SONGTYPE_LENGTH],
+    /// Accepted song origins, filled from the front; all [None] means no filter on the origin.
+    origins: [Option<SongOrigin>; SONGORIGIN_LENGTH],
+}
+
+/// Accumulates the [KaraBy] atoms, sorted by kind, before they are turned into a [Search].
+#[derive(Default)]
+struct SearchBuilder {
+    only_id: Option<KId>,
+    has_tag: Vec<String>,
+    tag_has_value: Vec<(String, String)>,
+    query: Vec<String>,
+    present_in_playlists: Vec<String>,
+    made_by: Vec<String>,
+
+    union_types: Vec<SongType>,
+    union_origin: Vec<SongOrigin>,
+}
+
+impl Search {
+    /// Create the search, precompute filters.
+    ///
+    /// Returns [None] when the automaton for the textual queries could not be built.
+    pub fn new(content: Vec<KaraBy>) -> Option<Self> {
+        SearchBuilder::new(content).build()
+    }
+
+    /// See if we match a kara or not: every filter that was asked for must accept the kara.
+    ///
+    /// The playlist and tag filters are not implemented yet, a search that uses them logs an
+    /// error and matches nothing.
+    fn matches(&self, kara: &Kara) -> bool {
+        macro_rules! ensure {
+            ($expr:expr) => {{
+                if !($expr) {
+                    return false;
+                }
+            }};
+        }
+
+        if let Some(id) = self.only_id {
+            ensure!(kara.id == id);
+        }
+
+        // The arrays have a fixed length, so `is_empty` can't tell whether a filter was asked
+        // for: the filter is unset when every slot is `None`.
+        if self.types.iter().any(Option::is_some) {
+            ensure!((self.types.iter().flatten()).any(|ty| kara.song_type == *ty));
+        }
+
+        if self.origins.iter().any(Option::is_some) {
+            ensure!((self.origins.iter().flatten()).any(|origin| kara.song_origin == *origin));
+        }
+
+        if !self.made_by_authors.is_empty() {
+            // The filter was lowercased in `SearchBuilder::build`, fold the kara side the same way.
+            let lowercase: HashSet<String> = (kara.kara_makers.iter())
+                .map(|author| author.to_lowercase())
+                .collect();
+            ensure!((self.made_by_authors.iter()).any(|author| lowercase.contains(author)));
+        }
+
+        if !self.present_in_playlists.is_empty() {
+            log::error!("implement the present in playlist check");
+            return false;
+        }
+
+        if let Some(queries) = self.queries.as_ref() {
+            ensure!(queries.is_match(&kara.song_title) || queries.is_match(&kara.song_source));
+        }
+
+        if !self.tag_has_value.is_empty() {
+            log::error!("implement the tag search thing");
+            return false;
+        }
+
+        true
+    }
+
+    /// If we match a kara (see [Self::matches]), then we return said kara.
+    pub fn matches_and_map<'a>(&self, kara: &'a Kara) -> Option<&'a Kara> {
+        self.matches(kara).then_some(kara)
+    }
+
+    /// Build the ASCII case-insensitive automaton used to match the textual queries.
+    ///
+    /// Returns [None] when the automaton can't be built, the error is logged.
+    fn with_queries(self, queries: Vec<String>) -> Option<Self> {
+        // Without patterns there is nothing to filter on: leave `queries` unset instead of
+        // storing an automaton that can never match, which would reject every kara.
+        if queries.is_empty() {
+            return Some(self);
+        }
+
+        let queries = AhoCorasickBuilder::new()
+            .ascii_case_insensitive(true)
+            .prefilter(true)
+            .build(queries)
+            .map_err(|err| log::error!("{err}"))
+            .inspect(|aho| {
+                log::info!(
+                    "memory usage for the query automaton: {} bytes",
+                    aho.memory_usage()
+                )
+            })
+            .ok()?;
+        Some(Self {
+            queries: Some(queries),
+            ..self
+        })
+    }
+
+    /// Register tags that are requested without any specific value.
+    fn with_tags(mut self, tags: Vec<String>) -> Self {
+        for tag in tags {
+            // Only the presence of the key matters here, already registered values are kept.
+            self.tag_has_value.entry(tag).or_default();
+        }
+        self
+    }
+
+    /// Register `(tag, value)` pairs, a value is stored only once per tag.
+    fn with_tag_with_values(mut self, tags_n_values: Vec<(String, String)>) -> Self {
+        for (tag, value) in tags_n_values {
+            let values = self.tag_has_value.entry(tag).or_default();
+            if !values.contains(&value) {
+                values.push(value);
+            }
+        }
+        self
+    }
+
+    /// Accept every song type of the list, see [Self::with_type].
+    fn with_types(self, tys: Vec<SongType>) -> Self {
+        tys.into_iter().fold(self, |this, ty| this.with_type(ty))
+    }
+
+    /// Accept every song origin of the list, see [Self::with_origin].
+    fn with_origins(self, origines: Vec<SongOrigin>) -> Self {
+        origines
+            .into_iter()
+            .fold(self, |this, origin| this.with_origin(origin))
+    }
+
+    /// Accept one more song type, duplicates are ignored.
+    fn with_type(mut self, ty: SongType) -> Self {
+        insert_unique(&mut self.types, ty);
+        self
+    }
+
+    /// Accept one more song origin, duplicates are ignored.
+    fn with_origin(mut self, origin: SongOrigin) -> Self {
+        insert_unique(&mut self.origins, origin);
+        self
+    }
+}
+
+/// Store `value` in the first free slot, unless an equal value is met before that slot.
+/// Nothing is stored when no slot is free.
+fn insert_unique<T: PartialEq>(slots: &mut [Option<T>], value: T) {
+    for slot in slots.iter_mut() {
+        match slot {
+            // Already present, we return.
+            Some(stored) if *stored == value => return,
+            // Not this one, we try the next slot.
+            Some(_) => {}
+            // Not found, we insert it.
+            None => {
+                *slot = Some(value);
+                return;
+            }
+        }
+    }
+}
+
+impl SearchBuilder {
+    /// Dispatch every [KaraBy] atom into the list of its kind. Only the first id is kept.
+    fn new(content: Vec<KaraBy>) -> Self {
+        let mut ret = SearchBuilder::default();
+        for kara_by in content {
+            match kara_by {
+                KaraBy::Id(id) => ret.only_id = ret.only_id.or(Some(id.into())),
+                KaraBy::Playlist(name) => ret.present_in_playlists.push(name),
+                KaraBy::Query(query) => ret.query.push(query),
+                KaraBy::Tag(tag, None) => ret.has_tag.push(tag),
+                KaraBy::Tag(tag, Some(value)) => ret.tag_has_value.push((tag, value)),
+                KaraBy::Author(author) => ret.made_by.push(author),
+                KaraBy::SongType(tiipe) => ret.union_types.push(tiipe),
+                KaraBy::SongOrigin(origin) => ret.union_origin.push(origin),
+            }
+        }
+        ret
+    }
+
+    /// Normalize the collected atoms and assemble the [Search].
+    ///
+    /// Returns [None] when the automaton for the textual queries could not be built.
+    fn build(mut self) -> Option<Search> {
+        log::error!("make_titlecase instead of make_ascii_lowercase");
+        // `Search::matches` folds the kara makers with `to_lowercase`, so the filter has to use
+        // the same folding: an ASCII-only one never matches names with non-ASCII uppercase.
+        for author in &mut self.made_by {
+            *author = author.to_lowercase();
+        }
+        (self.present_in_playlists.iter_mut()).for_each(|str| str.make_ascii_lowercase());
+        Search {
+            only_id: self.only_id,
+            made_by_authors: self.made_by,
+            present_in_playlists: self.present_in_playlists,
+            ..Default::default()
+        }
+        .with_origins(self.union_origin)
+        .with_types(self.union_types)
+        .with_tags(self.has_tag)
+        .with_tag_with_values(self.tag_has_value)
+        .with_queries(self.query)
+    }
+}
diff --git a/lektor_search/src/traits.rs b/lektor_search/src/traits.rs
new file mode 100644
index 00000000..b1fa5a02
--- /dev/null
+++ b/lektor_search/src/traits.rs
@@ -0,0 +1,43 @@
+use crate::batch::Batch;
+use lektor_payloads::{KId, Kara};
+
+/// A source of kara ids to run a search on.
+// NOTE(review): with the lint silenced the futures get no `Send` bound -- confirm it's fine.
+#[allow(async_fn_in_trait)]
+pub trait KaraIdExtractor {
+    /// Get the next kara id.
+    async fn next_id(&self) -> Option<KId>;
+
+    /// Get a next batch of kara id, to reduce any lock usage.
+    async fn next_id_batch<const SIZE: usize>(&self) -> Batch<SIZE, KId>;
+
+    /// Get the number of karas to process until the extractor is empty.
+    async fn count(&self) -> usize;
+
+    /// Tells if the extractor is empty or not.
+    async fn is_empty(&self) -> bool {
+        self.count().await == 0
+    }
+}
+
+/// A store from which karas are fetched by their id.
+// NOTE(review): with the lint silenced the futures get no `Send` bound -- confirm it's fine.
+#[allow(async_fn_in_trait)]
+pub trait KaraStore {
+    /// Get a kara by its [KId].
+    async fn get_kara(&self, id: KId) -> Option<&Kara>;
+
+    /// Get a list of karas by their [KId], the size of the [Batch] bounds how many are fetched.
+    /// Empty slots of `batch` are skipped, the lookups are stored from the front in order.
+    async fn get_kara_batch<const SIZE: usize>(
+        &self,
+        batch: Batch<SIZE, KId>,
+    ) -> Batch<SIZE, &Kara> {
+        let mut ret = [None; SIZE];
+        let ids = batch.into_array().into_iter().flatten();
+        for (slot, id) in ret.iter_mut().zip(ids) {
+            *slot = self.get_kara(id).await;
+        }
+        Batch::<SIZE, &Kara>::from_array_maybe(ret)
+    }
+}
diff --git a/lektord/Cargo.toml b/lektord/Cargo.toml
index 9b51010e..84f35257 100644
--- a/lektord/Cargo.toml
+++ b/lektord/Cargo.toml
@@ -31,7 +31,8 @@ lektor_nkdb = { path = "../lektor_nkdb" }
 lektor_repo = { path = "../lektor_repo" }
 lektor_utils = { path = "../lektor_utils" }
 lektor_mpris = { path = "../lektor_mpris" }
-lektor_payloads = { path = "../lektor_payloads" }
+lektor_payloads.workspace = true
+lektor_search.workspace = true
 
 [build-dependencies]
 anyhow.workspace = true
-- 
GitLab