No More Sort Prefixes (#3895)
This commit is contained in:
parent
9eadf956fb
commit
08c52b4281
16 changed files with 4095 additions and 3 deletions
101
API/Helpers/BookSortTitlePrefixHelper.cs
Normal file
101
API/Helpers/BookSortTitlePrefixHelper.cs
Normal file
|
|
@ -0,0 +1,101 @@
|
|||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Runtime.CompilerServices;
|
||||
|
||||
namespace API.Helpers;
|
||||
|
||||
/// <summary>
|
||||
/// Responsible for parsing book titles "The man on the street" and removing the prefix -> "man on the street".
|
||||
/// </summary>
|
||||
/// <remarks>This code is performance sensitive</remarks>
|
||||
public static class BookSortTitlePrefixHelper
|
||||
{
|
||||
private static readonly Dictionary<string, byte> PrefixLookup;
|
||||
private static readonly Dictionary<char, List<string>> PrefixesByFirstChar;
|
||||
|
||||
static BookSortTitlePrefixHelper()
|
||||
{
|
||||
var prefixes = new[]
|
||||
{
|
||||
// English
|
||||
"the", "a", "an",
|
||||
// Spanish
|
||||
"el", "la", "los", "las", "un", "una", "unos", "unas",
|
||||
// French
|
||||
"le", "la", "les", "un", "une", "des",
|
||||
// German
|
||||
"der", "die", "das", "den", "dem", "ein", "eine", "einen", "einer",
|
||||
// Italian
|
||||
"il", "lo", "la", "gli", "le", "un", "uno", "una",
|
||||
// Portuguese
|
||||
"o", "a", "os", "as", "um", "uma", "uns", "umas",
|
||||
// Russian (transliterated common ones)
|
||||
"в", "на", "с", "к", "от", "для",
|
||||
};
|
||||
|
||||
// Build lookup structures
|
||||
PrefixLookup = new Dictionary<string, byte>(prefixes.Length, StringComparer.OrdinalIgnoreCase);
|
||||
PrefixesByFirstChar = new Dictionary<char, List<string>>();
|
||||
|
||||
foreach (var prefix in prefixes)
|
||||
{
|
||||
PrefixLookup[prefix] = 1;
|
||||
|
||||
var firstChar = char.ToLowerInvariant(prefix[0]);
|
||||
if (!PrefixesByFirstChar.TryGetValue(firstChar, out var list))
|
||||
{
|
||||
list = [];
|
||||
PrefixesByFirstChar[firstChar] = list;
|
||||
}
|
||||
list.Add(prefix);
|
||||
}
|
||||
}
|
||||
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
public static ReadOnlySpan<char> GetSortTitle(ReadOnlySpan<char> title)
|
||||
{
|
||||
if (title.IsEmpty) return title;
|
||||
|
||||
// Fast detection of script type by first character
|
||||
var firstChar = title[0];
|
||||
|
||||
// CJK Unicode ranges - no processing needed for most cases
|
||||
if ((firstChar >= 0x4E00 && firstChar <= 0x9FFF) || // CJK Unified
|
||||
(firstChar >= 0x3040 && firstChar <= 0x309F) || // Hiragana
|
||||
(firstChar >= 0x30A0 && firstChar <= 0x30FF)) // Katakana
|
||||
{
|
||||
return title;
|
||||
}
|
||||
|
||||
var firstSpaceIndex = title.IndexOf(' ');
|
||||
if (firstSpaceIndex <= 0) return title;
|
||||
|
||||
var potentialPrefix = title.Slice(0, firstSpaceIndex);
|
||||
|
||||
// Fast path: check if first character could match any prefix
|
||||
firstChar = char.ToLowerInvariant(potentialPrefix[0]);
|
||||
if (!PrefixesByFirstChar.ContainsKey(firstChar))
|
||||
return title;
|
||||
|
||||
// Only do the expensive lookup if first character matches
|
||||
if (PrefixLookup.ContainsKey(potentialPrefix.ToString()))
|
||||
{
|
||||
var remainder = title.Slice(firstSpaceIndex + 1);
|
||||
return remainder.IsEmpty ? title : remainder;
|
||||
}
|
||||
|
||||
return title;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Removes the sort prefix
|
||||
/// </summary>
|
||||
/// <param name="title"></param>
|
||||
/// <returns></returns>
|
||||
public static string GetSortTitle(string title)
|
||||
{
|
||||
var result = GetSortTitle(title.AsSpan());
|
||||
|
||||
return result.ToString();
|
||||
}
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue