diff --git a/lektor_nkdb/src/database/epoch.rs b/lektor_nkdb/src/database/epoch.rs index 80b4cc15155c29dae5434830806676ee7dfe9e84..dc9044f23da8148a9e1d70197dd19144457ec058 100644 --- a/lektor_nkdb/src/database/epoch.rs +++ b/lektor_nkdb/src/database/epoch.rs @@ -30,6 +30,14 @@ impl Epoch { self.0.values().find(|kara| id.eq(&kara.id.local_id())) } + /// Get the list of [Kara] corresponding to the [KId]. If we failed to find one kara we log the + /// error and continue (silent fail). + pub fn get_karas_by_kid(&self, ids: impl IntoIterator<Item = KId>) -> Vec<&Kara> { + ids.into_iter() + .flat_map(|id| self.get_kara_by_kid(id)) + .collect() + } + /// Get a [Kara] by its local id [KId]. Should be more efficient that the [get_kara_by_u64] /// function because we don't iterate in the [HashMap] but we directly hash the string and get /// the bucket where the object is stored. diff --git a/lektor_nkdb/src/database/kara.rs b/lektor_nkdb/src/database/kara.rs index b6bc76c6048f3f7772d67b2333e996c18edd3636..79f3f3e605ea5a880d099d602b89edad03adece6 100644 --- a/lektor_nkdb/src/database/kara.rs +++ b/lektor_nkdb/src/database/kara.rs @@ -90,6 +90,15 @@ impl Kara { (Physical { hash: h1, .. }, Physical { hash: h2, .. }) if h1.eq(h2) ) } + + /// Get the source/title string to use for regex match. + pub(crate) fn to_title_string(&self) -> String { + let mut ret = String::with_capacity(self.song_title.len() + self.song_source.len() + 3); + ret.push_str(&self.song_source.to_lowercase()); + ret.push_str(" / "); + ret.push_str(&self.song_title.to_lowercase()); + ret + } } impl std::fmt::Display for Kara { diff --git a/lektor_nkdb/src/database/pool.rs b/lektor_nkdb/src/database/pool.rs index 0c42c6272617f37052a386e29d064c297a413cb6..ac76b200b3d8182bb57d92e94b74d6153bc9a060 100644 --- a/lektor_nkdb/src/database/pool.rs +++ b/lektor_nkdb/src/database/pool.rs @@ -4,7 +4,7 @@ //! representation for searching plus some mapping. A pool is common for all the epochs. For epoch //! specific things like the [u64] to [Kid] / [Kara] mappings, do that in the epoch struct. -use crate::{Kara, Playlist}; +use crate::Playlist; use hashbrown::HashMap; use serde::{Deserialize, Serialize}; use std::{collections::hash_map::DefaultHasher, hash::Hasher, sync::Arc}; @@ -20,9 +20,6 @@ pub(crate) struct Pool { /// The mapping for remote id <-> local id id_mapping: RwLock<Vec<(KId, RemoteKId)>>, - - /// Caching hashmap for kara strings for latter matches. - kara_string_cache: RwLock<HashMap<KId, Arc<str>>>, } impl Pool { @@ -52,13 +49,6 @@ impl Pool { (id.cloned(), rkid) } - /// Get the list of cached strings with their associated ids. - pub(crate) async fn get_cached_strings(&self, kids: Vec<KId>) -> Vec<(KId, Arc<str>)> { - let cache = self.kara_string_cache.read().await; - let get_entry = |kid| cache.get(&kid).map(|value| (kid, value.clone())); - kids.into_iter().flat_map(get_entry).collect() - } - /// Get a pointer to the string by its value. Used to reduce the amount of memory allocated for /// string representation. The heuristic is that we can share the song's origins and the tag /// keys and values. This is the sync version where we don't need async because we have a mut @@ -87,20 +77,6 @@ impl Pool { .into() } - /// Cache a kara's string representation for searches. We index the string by the id of the - /// kara so we can query it from queue, playlists, etc. - pub(crate) async fn cache_kara_string(&self, kara: &Kara) { - let (id, str) = (kara.id.clone(), kara.to_string().to_lowercase().into()); - self.kara_string_cache.write().await.insert(id, str); - } - - /// Same as [Self::cache_kara_string] but the sync version where we don't need to aquire the - /// lock in an async way. - pub(crate) fn cache_kara_string_sync(&mut self, kara: &Kara) { - let (id, str) = (kara.id.clone(), kara.to_string().to_lowercase().into()); - self.kara_string_cache.get_mut().insert(id, str); - } - /// Get the maximal id present in the pool. pub(crate) async fn maximal_id(&self) -> u64 { let list = self.id_mapping.read().await; @@ -131,7 +107,6 @@ impl Pool { let values = values.into_iter().map(|v| self.get_str_sync(v)); (key, values.collect()) })); - self.cache_kara_string_sync(&kara); (self.get_str_sync(kid.0), kara) }); let _ = std::mem::replace(data, EpochData::from_iter(content)); diff --git a/lektor_nkdb/src/database/update.rs b/lektor_nkdb/src/database/update.rs index 4fa642af0e0ca9f55aa3e884a4dde0795bc4c862..eb9f9ab3319f3f47c49031b4d4778c5d996c143f 100644 --- a/lektor_nkdb/src/database/update.rs +++ b/lektor_nkdb/src/database/update.rs @@ -95,14 +95,13 @@ impl<'a, Storage: DatabaseStorage> UpdateHandler<'a, Storage> { .insert(kara.id.clone(), kara.clone()); Ok(None) }; - let doit = |kara: Kara, new_epoch: &'a RefCell<PushVecMutNode<Epoch>>, pool: &'a Pool| async { + let doit = |kara: Kara, new_epoch: &'a RefCell<PushVecMutNode<Epoch>>| async { let KaraStatus::Physical { hash, .. } = kara.kara_status else { log::warn!("tried to download a virtual kara: {kara}"); return reuse(kara, new_epoch); }; let (kid, rkid) = (kara.id.clone(), kara.remote.clone()); log::debug!("need to download kid {kid} / remote_kid {rkid}"); - pool.cache_kara_string(&kara).await; new_epoch .borrow_mut() .content() @@ -112,16 +111,16 @@ impl<'a, Storage: DatabaseStorage> UpdateHandler<'a, Storage> { }; match self.last_epoch { - None => doit(kara, &self.new_epoch, self.pool).await, // No way the kara was here + None => doit(kara, &self.new_epoch).await, // No way the kara was here Some(last_epoch) => match last_epoch.data().get(&kara.id) { Some(old_kara) if !kara.same_file_as(old_kara) => { - doit(kara, &self.new_epoch, self.pool).await // The file has changed + doit(kara, &self.new_epoch).await // The file has changed } - None => doit(kara, &self.new_epoch, self.pool).await, // Not present last time - Some(_) => reuse(kara, &self.new_epoch), // Present last time, but we use the new - // built kara because some informations - // might have been updated even if the - // epoch was not incremented. + None => doit(kara, &self.new_epoch).await, // Not present last time + Some(_) => reuse(kara, &self.new_epoch), // Present last time, but we use the new + // built kara because some informations + // might have been updated even if the + // epoch was not incremented. }, } } diff --git a/lektor_nkdb/src/lib.rs b/lektor_nkdb/src/lib.rs index 526808a1288cba8228e68584c9d4b9d3a373a1ee..7ad89a4e7340f0791f1cd66605473db60ca5e644 100644 --- a/lektor_nkdb/src/lib.rs +++ b/lektor_nkdb/src/lib.rs @@ -11,7 +11,11 @@ pub use crate::{ }; use crate::{database::*, queue::*, search::*}; -use anyhow::{anyhow, Context, Result}; +use anyhow::{anyhow, bail, Context, Result}; +use futures::{ + stream::{self, FuturesUnordered}, + StreamExt, +}; use hashbrown::HashMap; use lektor_utils::{log, pushvec::*}; use playlist::Playlists; @@ -152,13 +156,27 @@ impl<Storage: DatabaseStorage> Database<Storage> { None => vec![], }, }; - log::error!( - "do the search thing without cloning the cached string vector (len is {})...", - kids.len() - ); - let strings = self.pool.get_cached_strings(kids).await; - let matched_string = |(kid, string)| regex.matches(&string).then_some(kid); - Ok(strings.into_iter().filter_map(matched_string).collect()) + let mut karas: Vec<KId> = match self.last_epoch().await { + Some(epoch) => epoch + .get_karas_by_kid(kids) + .into_iter() + .filter_map(|kara| regex.matches(kara).then_some(kara.id.clone())) + .collect(), + None => bail!("no epoch to search kara from"), + }; + let plts = regex.into_needed_playlists(); + if !plts.is_empty() { + let filter: Vec<_> = stream::iter(plts.into_iter()) + .then(|name| self.playlists.get_content(name)) + .collect::<FuturesUnordered<_>>() + .await + .into_iter() + .flatten() + .flatten() + .collect(); + karas.retain(|kara_id| filter.contains(kara_id)); + } + Ok(karas) } /// Returns the kara count from the search set. diff --git a/lektor_nkdb/src/queue.rs b/lektor_nkdb/src/queue/mod.rs similarity index 80% rename from lektor_nkdb/src/queue.rs rename to lektor_nkdb/src/queue/mod.rs index 2ca8cdaa0d99e07c92e569c6ee9b9233716c12a3..dd81091a252822b56e4f28e9ca92b2b24cd1587d 100644 --- a/lektor_nkdb/src/queue.rs +++ b/lektor_nkdb/src/queue/mod.rs @@ -1,99 +1,16 @@ //! Store the state of the queue in memory. For now we don't store it on disk, see latter how we do //! that thing. +mod priority; + +pub use priority::*; + use crate::*; -use lektor_procmacros::{EnumVariantCount, EnumVariantIter}; use lektor_utils::{filter_range, log, BoundedBoundRange}; use rand::{seq::SliceRandom, thread_rng}; -use std::{ops::RangeInclusive, str::FromStr}; +use std::ops::RangeInclusive; use tokio::sync::RwLock; -/// Priorities to insert into the queue. We are one based because that's how humans thinks about -/// numbers... All the things are based around the fact that there is only 4 priorities. -#[derive( - Debug, - Default, - Clone, - Copy, - PartialEq, - PartialOrd, - Serialize, - Deserialize, - EnumVariantCount, - EnumVariantIter, -)] -pub enum Priority { - #[default] - Add = 1, - Suggest = 2, - Insert = 3, - Enforce = 4, -} - -impl FromStr for Priority { - type Err = String; - fn from_str(s: &str) -> Result<Self, Self::Err> { - match &s.to_lowercase()[..] { - "1" | "add" => Ok(Self::Add), - "2" | "suggest" => Ok(Self::Suggest), - "3" | "insert" => Ok(Self::Insert), - "4" | "enforce" => Ok(Self::Enforce), - _ => Err(format!("unknown add level for queue: {s}")), - } - } -} - -impl AsRef<str> for Priority { - fn as_ref(&self) -> &str { - self.as_str() - } -} - -impl Priority { - pub const fn as_str(&self) -> &str { - match self { - Priority::Add => "1", - Priority::Suggest => "2", - Priority::Insert => "3", - Priority::Enforce => "4", - } - } - - pub const fn index(&self) -> usize { - (*self as usize) - 1 - } -} - -impl std::fmt::Display for Priority { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.write_str(self.as_str()) - } -} - -macro_rules! impl_prio_from { - ($int_type: ident, $($nexts: ident),+) => { - impl_prio_from! { $int_type } - impl_prio_from! { $($nexts),+ } - }; - - ($int_type: ident) => { - impl From<Priority> for $int_type { fn from(value: Priority) -> Self { value as $int_type } } - - impl From<$int_type> for Priority { - fn from(value: $int_type) -> Self { match value { - 2 => Priority::Suggest, - 3 => Priority::Insert, - value if value >= 4 => Priority::Enforce, - _ => Priority::Add, - }} - } - }; -} - -impl_prio_from! { u8, u16, u32, u64, u128, usize - , i8, i16, i32, i64, i128, isize -} - /// The queue contains the playing kara and the following ones. This type is just a wrapper around /// the [QueueContent] with a [RwLock]. For function documentation see [QueueContent]. #[derive(Debug, Default)] @@ -384,22 +301,3 @@ impl_queue! { self.history.retain(|kid| id.ne(kid)) } } - -#[test] -fn test_priorities() { - use Priority::*; - let into = <Priority as Into<usize>>::into; - - assert!(Add < Enforce); - assert!(into(Add) < PRIORITY_LENGTH); - assert!(into(Suggest) < PRIORITY_LENGTH); - assert!(into(Insert) < PRIORITY_LENGTH); - assert!(into(Enforce) == PRIORITY_LENGTH); - - #[allow(dead_code)] - fn test_4_variants(prio: Priority) { - match prio { - Add | Suggest | Insert | Enforce => (), - } - } -} diff --git a/lektor_nkdb/src/queue/priority.rs b/lektor_nkdb/src/queue/priority.rs new file mode 100644 index 0000000000000000000000000000000000000000..b4aa903d7c13b1788d4d05c60a3a8e21f20635e8 --- /dev/null +++ b/lektor_nkdb/src/queue/priority.rs @@ -0,0 +1,108 @@ +use crate::*; +use lektor_procmacros::{EnumVariantCount, EnumVariantIter}; +use std::str::FromStr; + +/// Priorities to insert into the queue. We are one based because that's how humans thinks about +/// numbers... All the things are based around the fact that there is only 4 priorities. +#[derive( + Debug, + Default, + Clone, + Copy, + PartialEq, + PartialOrd, + Serialize, + Deserialize, + EnumVariantCount, + EnumVariantIter, +)] +pub enum Priority { + #[default] + Add = 1, + Suggest = 2, + Insert = 3, + Enforce = 4, +} + +impl FromStr for Priority { + type Err = String; + fn from_str(s: &str) -> Result<Self, Self::Err> { + match &s.to_lowercase()[..] { + "1" | "add" => Ok(Self::Add), + "2" | "suggest" => Ok(Self::Suggest), + "3" | "insert" => Ok(Self::Insert), + "4" | "enforce" => Ok(Self::Enforce), + _ => Err(format!("unknown add level for queue: {s}")), + } + } +} + +impl AsRef<str> for Priority { + fn as_ref(&self) -> &str { + self.as_str() + } +} + +impl Priority { + pub const fn as_str(&self) -> &str { + match self { + Priority::Add => "1", + Priority::Suggest => "2", + Priority::Insert => "3", + Priority::Enforce => "4", + } + } + + pub const fn index(&self) -> usize { + (*self as usize) - 1 + } +} + +impl std::fmt::Display for Priority { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(self.as_str()) + } +} + +macro_rules! impl_prio_from { + ($int_type: ident, $($nexts: ident),+) => { + impl_prio_from! { $int_type } + impl_prio_from! { $($nexts),+ } + }; + + ($int_type: ident) => { + impl From<Priority> for $int_type { fn from(value: Priority) -> Self { value as $int_type } } + + impl From<$int_type> for Priority { + fn from(value: $int_type) -> Self { match value { + 2 => Priority::Suggest, + 3 => Priority::Insert, + value if value >= 4 => Priority::Enforce, + _ => Priority::Add, + }} + } + }; +} + +impl_prio_from! { u8, u16, u32, u64, u128, usize + , i8, i16, i32, i64, i128, isize +} + +#[test] +fn test_priorities() { + use Priority::*; + let into = <Priority as Into<usize>>::into; + + assert!(Add < Enforce); + assert!(into(Add) < PRIORITY_LENGTH); + assert!(into(Suggest) < PRIORITY_LENGTH); + assert!(into(Insert) < PRIORITY_LENGTH); + assert!(into(Enforce) == PRIORITY_LENGTH); + + #[allow(dead_code)] + fn test_4_variants(prio: Priority) { + match prio { + Add | Suggest | Insert | Enforce => (), + } + } +} diff --git a/lektor_nkdb/src/search.rs b/lektor_nkdb/src/search/kara_by.rs similarity index 72% rename from lektor_nkdb/src/search.rs rename to lektor_nkdb/src/search/kara_by.rs index 32be1d3b3765b52d5559dbabb4f446a0cc051d19..f0fbf825eee59ac5cb384c53b56a74b28b885841 100644 --- a/lektor_nkdb/src/search.rs +++ b/lektor_nkdb/src/search/kara_by.rs @@ -1,16 +1,9 @@ -//! Utilities to search the database, the playlists, the history, the queue, etc, in a single -//! consistent way. - -use crate::{playlist::PlaylistName, *}; +use crate::*; use lektor_utils::either; use regex::{Regex, RegexBuilder}; use serde::{Deserialize, Serialize}; use std::{borrow::Cow, convert::Infallible, str::FromStr}; -/// Structure wrapping a regex to fuzzy search into the database or queue for matching karas. -#[derive(Debug)] -pub(crate) struct Search(Regex); - #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] pub enum KaraBy { Id(u64), @@ -22,6 +15,33 @@ pub enum KaraBy { Playlist(String), } +/// Get the index if the character is not an alphanumeric or space one. +fn non_alphanumspace_char((i, c): (usize, char)) -> Option<usize> { + (!(c.is_ascii_digit() || c.is_alphanumeric() || c.is_whitespace())).then_some(i) +} + +/// Trim a string in-place. +fn trim_in_place(value: &mut String) { + const SPACE: &[char] = &[' ', '\t', '\r', '\n']; + while value.starts_with(SPACE) { + value.remove(0); + } + while value.ends_with(SPACE) { + value.pop(); + } +} + +fn build_regex_for_cow(value: Cow<'_, str>) -> Result<Regex> { + let mut fuzzy = value.trim().replace(' ', r".+").to_lowercase(); + fuzzy.insert_str(0, r".*"); + fuzzy.push_str(r".*"); + Ok(RegexBuilder::new(&fuzzy) + .nest_limit(32) + .swap_greed(true) + .case_insensitive(false) + .build()?) +} + impl ToString for KaraBy { fn to_string(&self) -> String { let str: Cow<str> = self.into(); @@ -45,16 +65,6 @@ impl<'a> From<&'a KaraBy> for Cow<'a, str> { } } -fn trim_in_place(value: &mut String) { - const SPACE: &[char] = &[' ', '\t', '\r', '\n']; - while value.starts_with(SPACE) { - value.remove(0); - } - while value.ends_with(SPACE) { - value.pop(); - } -} - impl From<String> for KaraBy { fn from(mut value: String) -> Self { trim_in_place(&mut value); @@ -95,26 +105,12 @@ impl FromStr for KaraBy { } } -/// Structure to tell from which KId set we are searching. -#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq)] -pub enum SearchFrom { - Queue, - Database, - History, - Playlist(PlaylistName), -} - -/// Get the index if the character is not an alphanumeric or space one. -fn non_alphanumspace_char((i, c): (usize, char)) -> Option<usize> { - (!(c.is_ascii_digit() || c.is_alphanumeric() || c.is_whitespace())).then_some(i) -} +impl TryFrom<KaraBy> for Regex { + type Error = anyhow::Error; -impl Search { - /// Create a new search regex. We only accept alphanumeric stuff to avoid vulnerabilities - /// around regex from untrusted sources. We will do the fuzzy stuff latter. - pub fn new(regex: KaraBy) -> anyhow::Result<Self> { + fn try_from(value: KaraBy) -> std::result::Result<Self, Self::Error> { use KaraBy::*; - match ®ex { + match &value { Id(_) | SongType(_) | SongOrigin(_) => {} regex @ Author(_) | regex @ Playlist(_) | regex @ Tag(_) | regex @ Query(_) => { if let Some(idx) = Into::<Cow<'_, _>>::into(regex) @@ -126,23 +122,23 @@ impl Search { } } } - let mut fuzzy = Into::<Cow<str>>::into(®ex) - .trim() - .replace(' ', r".+") - .to_lowercase(); - fuzzy.insert_str(0, r".*"); - fuzzy.push_str(r".*"); - let regex = RegexBuilder::new(&fuzzy) - // .size_limit(128) - .nest_limit(32) - .swap_greed(true) - .case_insensitive(false) - .build()?; - Ok(Self(regex)) + + build_regex_for_cow(Into::<Cow<str>>::into(&value)) } +} + +impl TryFrom<KaraBy> for SearchBy { + type Error = anyhow::Error; - /// A match function. - pub fn matches(&self, kara: impl AsRef<str>) -> bool { - self.0.is_match(kara.as_ref()) + fn try_from(value: KaraBy) -> std::result::Result<Self, Self::Error> { + match value { + KaraBy::Query(query) => build_regex_for_cow(Cow::Owned(query)).map(SearchBy::Query), + KaraBy::Id(id) => Ok(SearchBy::Id(id)), + KaraBy::Tag((tag, value)) => Ok(SearchBy::Tag((tag, value))), + KaraBy::SongType(ty) => Ok(SearchBy::SongType(ty)), + KaraBy::SongOrigin(ori) => Ok(SearchBy::SongOrigin(ori)), + KaraBy::Author(auth) => Ok(SearchBy::Author(auth)), + KaraBy::Playlist(plt) => Ok(SearchBy::Playlist(plt)), + } } } diff --git a/lektor_nkdb/src/search/mod.rs b/lektor_nkdb/src/search/mod.rs new file mode 100644 index 0000000000000000000000000000000000000000..3aec9841de76eb3d2f3e6c045355eb262626c45c --- /dev/null +++ b/lektor_nkdb/src/search/mod.rs @@ -0,0 +1,91 @@ +//! Utilities to search the database, the playlists, the history, the queue, etc, in a single +//! consistent way. + +mod kara_by; + +pub use kara_by::*; + +use crate::{playlist::PlaylistName, Kara}; +use kurisu_api::v2::{SongOrigin, SongType}; +use regex::Regex; +use serde::{Deserialize, Serialize}; + +/// Structure wrapping a regex to fuzzy search into the database or queue for matching karas. +#[derive(Debug)] +pub(crate) struct Search(SearchBy); + +/// Structure to tell from which KId set we are searching. +#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq)] +pub enum SearchFrom { + Queue, + Database, + History, + Playlist(PlaylistName), +} + +/// Structure used to tell how to do the search, either by a regex, or by applying another way +/// (author, tag, etc), or a list (intersection) of multiple things. +#[derive(Debug, Clone)] +pub enum SearchBy { + Query(Regex), + Id(u64), + Tag((String, Option<String>)), + SongType(SongType), + SongOrigin(SongOrigin), + Author(String), + Playlist(String), + Multiple(Vec<SearchBy>), +} + +impl FromIterator<SearchBy> for SearchBy { + fn from_iter<T: IntoIterator<Item = SearchBy>>(iter: T) -> Self { + SearchBy::Multiple(iter.into_iter().collect()) + } +} + +impl SearchBy { + /// Get the list of playlist that are needed for the kara to match. This is the only + /// informations that is not present in the epoch and thus need to be handled differently... + pub(crate) fn into_needed_playlists(self) -> Vec<String> { + match self { + SearchBy::Playlist(plt) => vec![plt], + SearchBy::Multiple(searches) => searches + .into_iter() + .flat_map(Self::into_needed_playlists) + .collect(), + _ => vec![], + } + } +} + +impl Search { + /// Create a new search regex. We only accept alphanumeric stuff to avoid vulnerabilities + /// around regex from untrusted sources. We will do the fuzzy stuff latter. + pub(crate) fn new(regex: KaraBy) -> anyhow::Result<Self> { + Ok(Self(SearchBy::try_from(regex)?)) + } + + /// Get the list of playlist that are needed for the kara to match. This is the only + /// informations that is not present in the epoch and thus need to be handled differently... + pub(crate) fn into_needed_playlists(self) -> Vec<String> { + self.0.into_needed_playlists() + } + + /// A match function. + pub(crate) fn matches(&self, kara: &Kara) -> bool { + match &self.0 { + SearchBy::Query(regex) => regex.is_match(&kara.to_title_string()), + SearchBy::Id(id) => kara.id.local_id().eq(id), + SearchBy::SongType(ty) => kara.song_type.eq(ty), + SearchBy::SongOrigin(ori) => kara.song_origin.eq(ori), + + SearchBy::Author(_) => todo!(), + SearchBy::Tag(_) => todo!(), + + SearchBy::Multiple(_) => todo!(), + + // Handled after... + SearchBy::Playlist(_) => true, + } + } +}