Set up the parsing rules for Magazines.

This commit is contained in:
Joseph Milazzo 2024-02-10 13:10:06 -06:00
parent a443be7523
commit 5a522b6d5b
3 changed files with 147 additions and 22 deletions

View file

@ -6,15 +6,38 @@ public class MagazineParserTests
{
/// <summary>
/// Checks that the magazine series parser returns the expected series name for each filename.
/// </summary>
[Theory]
[InlineData("3D World - 2018 UK", "3D World")]
[InlineData("3D World - 2018", "3D World")]
[InlineData("UK World - 022012 [Digital]", "UK World")]
[InlineData("Computer Weekly - September 2023", "Computer Weekly")]
public void ParseSeriesTest(string filename, string expected)
{
    var actualSeries = API.Services.Tasks.Scanner.Parser.Parser.ParseMagazineSeries(filename);

    Assert.Equal(expected, actualSeries);
}
/// <summary>
/// Checks that the magazine volume parser returns the expected volume (the year) for each filename.
/// </summary>
[Theory]
[InlineData("UK World - 022012 [Digital]", "2012")]
[InlineData("Computer Weekly - September 2023", "2023")]
[InlineData("Computer Weekly - September 2023 #2", "2023")]
[InlineData("PC Games - 2001 #01", "2001")]
public void ParseVolumeTest(string filename, string expected)
{
    var actualVolume = API.Services.Tasks.Scanner.Parser.Parser.ParseMagazineVolume(filename);

    Assert.Equal(expected, actualVolume);
}
/// <summary>
/// Checks that the magazine chapter parser returns the expected chapter for each filename:
/// an explicit "#N" issue number, a month name converted to its number, or "0" when neither is present.
/// </summary>
[Theory]
[InlineData("UK World - 022012 [Digital]", "0")]
[InlineData("Computer Weekly - September 2023", "9")]
[InlineData("Computer Weekly - September 2023 #2", "2")]
[InlineData("PC Games - 2001 #01", "1")]
public void ParseChapterTest(string filename, string expected)
{
    var actualChapter = API.Services.Tasks.Scanner.Parser.Parser.ParseMagazineChapter(filename);

    Assert.Equal(expected, actualChapter);
}
// [Theory]
// [InlineData("Harrison, Kim - Dates from Hell - Hollows Vol 2.5.epub", "2.5")]
// public void ParseVolumeTest(string filename, string expected)
// [InlineData("AIR International Vol. 14 No. 3 (ISSN 1011-3250)", "1011-3250")]
// public void ParseGTINTest(string filename, string expected)
// {
// Assert.Equal(expected, API.Services.Tasks.Scanner.Parser.Parser.ParseMagazineVolume(filename));
// Assert.Equal(expected, API.Services.Tasks.Scanner.Parser.Parser.ParseGTIN(filename));
// }
}

View file

@ -1,5 +1,7 @@
using System;
using System.Collections.Generic;
using System.Collections.Immutable;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Text.RegularExpressions;
@ -9,7 +11,7 @@ using API.Extensions;
namespace API.Services.Tasks.Scanner.Parser;
#nullable enable
public static class Parser
public static partial class Parser
{
public const string DefaultChapter = "0";
public const string DefaultVolume = "0";
@ -59,6 +61,8 @@ public static class Parser
/// </summary>
private const string CommonSpecial = @"Specials?|One[- ]?Shot|Extra(?:\sChapter)?(?=\s)|Art Collection|Side Stories|Bonus";
/// <summary>
/// Source-generated regex that matches input made up solely of one or more <c>\d</c> digit characters.
/// </summary>
[GeneratedRegex(@"^\d+$")]
private static partial Regex IsNumberRegex();
/// <summary>
/// Matches against font-family css syntax. Does not match if url import has data: starting, as that is binary data
@ -634,13 +638,17 @@ public static class Parser
#region Magazine
/// <summary>
/// Lookup from a month name or abbreviation to its month number (1-12), built once by CreateMonthMappings.
/// </summary>
private static readonly Dictionary<string, int> _monthMappings = CreateMonthMappings();
private static readonly Regex[] MagazineSeriesRegex = new[]
{
// 3D World - 2018 UK
// 3D World - 2018 UK, 3D World - 022014
new Regex(
@"(?<Series>.+?)(\b|_|\s)?-(\b|_|\s)(?<Year>\d{4}).+",
@"^(?<Series>.+?)(_|\s)*-(_|\s)*\d{4,6}.*",
MatchOptions, RegexTimeout),
// AIR International - April 2018 UK
new Regex(
@"^(?<Series>.+?)(_|\s)*-(_|\s)*.*",
MatchOptions, RegexTimeout),
// The New Yorker - April 2, 2018 USA
// AIR International Magazine 2006
// AIR International Vol. 14 No. 3 (ISSN 1011-3250)
@ -648,9 +656,34 @@ public static class Parser
private static readonly Regex[] MagazineVolumeRegex = new[]
{
// Batman & Wildcat (1 of 3)
// 3D World - 2018 UK, 3D World - 022014
// NOTE(review): in the pattern containing `\d{2}?`, the trailing `?` makes the {2} quantifier lazy;
// it does not make the two digits optional. That pattern therefore requires at least six consecutive
// digits after the dash (e.g. 022014). A bare year such as "- 2018" is only picked up by the
// \D+ pattern further down, which needs at least one non-digit between the dash and the year.
new Regex(
@"(?<Series>.*(\d{4})?)( |_)(?:\((?<Chapter>\d+) of \d+)",
@"^(?<Series>.+?)(_|\s)*-(_|\s)*\d{2}?(?<Volume>\d{4}).*",
MatchOptions, RegexTimeout),
// 3D World - Sept 2018
new Regex(
@"^(?<Series>.+?)(_|\s)*-(_|\s)*\D+(?<Volume>\d{4}).*",
MatchOptions, RegexTimeout),
// 3D World - Sept 2018
// NOTE(review): this entry is character-for-character identical to the previous pattern, so it can
// never match anything the previous one did not; it looks like an accidental duplicate.
new Regex(
@"^(?<Series>.+?)(_|\s)*-(_|\s)*\D+(?<Volume>\d{4}).*",
MatchOptions, RegexTimeout),
};
/// <summary>
/// Patterns used by ParseMagazineChapter to pull a chapter out of a magazine filename.
/// They are evaluated in array order and the first usable Chapter capture wins, so the explicit
/// "#N" issue number takes priority over a month name, and full month names over abbreviations.
/// </summary>
private static readonly Regex[] MagazineChapterRegex = new[]
{
    // 3D World - September 2023 #2
    new Regex(
        @"^(?<Series>.+?)(_|\s)*-(_|\s)*.*#(?<Chapter>\d+).*",
        MatchOptions, RegexTimeout),
    // Computer Weekly - September 2023
    new Regex(
        @"^(?<Series>.+?)(_|\s)*-(_|\s)*(?<Chapter>January|February|March|April|May|June|July|August|September|October|November|December).*",
        MatchOptions, RegexTimeout),
    // Computer Weekly - Sept 2023, Computer Weekly - Sep 2023
    // "Sept?" accepts both the four-letter and the three-letter abbreviation of September;
    // previously only "Sept" was listed, so "Sep 2023" fell through to the default chapter.
    new Regex(
        @"^(?<Series>.+?)(_|\s)*-(_|\s)*(?<Chapter>Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sept?|Oct|Nov|Dec).*",
        MatchOptions, RegexTimeout),
};
@ -801,14 +834,71 @@ public static class Parser
if (!group["Volume"].Success || group["Volume"] == Match.Empty) continue;
var value = group["Volume"].Value;
var hasPart = group["Part"].Success;
return FormatValue(value, hasPart);
return FormatValue(value, false);
}
}
return DefaultVolume;
}
/// <summary>
/// Builds the lookup from English month names and abbreviations to their month number.
/// </summary>
/// <returns>
/// A dictionary with case-insensitive (ordinal) keys containing, for each month 1-12, the full name
/// ("September"), the three-letter abbreviation ("Sep"), and the extra alias "Sept" for September.
/// </returns>
private static Dictionary<string, int> CreateMonthMappings()
{
    Dictionary<string, int> mappings = new(StringComparer.OrdinalIgnoreCase);

    // Add English month names and shorthands
    for (var i = 1; i <= 12; i++)
    {
        var month = new DateTime(2022, i, 1);
        var monthName = month.ToString("MMMM", CultureInfo.InvariantCulture);
        var monthAbbreviation = month.ToString("MMM", CultureInfo.InvariantCulture);
        mappings[monthName] = i;
        mappings[monthAbbreviation] = i;
    }

    // The invariant culture abbreviates September as "Sep", but the magazine chapter pattern
    // captures the four-letter form "Sept". Without this alias that capture is never resolved
    // and the chapter silently falls back to the default.
    mappings["Sept"] = 9;

    // Add mappings for other languages if needed
    // Example: mappings["KoreanMonthName"] = correspondingNumericalValue;
    return mappings;
}
/// <summary>
/// Looks up the numeric value registered for a month name in the supplied mappings.
/// </summary>
/// <param name="month">Month name or abbreviation to resolve.</param>
/// <param name="monthMappings">Lookup table from month text to month number.</param>
/// <returns>The mapped number, or 0 when <paramref name="month"/> is not a key in the table.</returns>
static int ConvertMonthToNumber(string month, Dictionary<string, int> monthMappings) =>
    // 0 doubles as the "not found" signal; other languages or fallbacks could be handled here later.
    monthMappings.TryGetValue(month, out var monthNumber) ? monthNumber : 0;
/// <summary>
/// Extracts the chapter for a magazine filename by running each magazine chapter pattern in order.
/// </summary>
/// <param name="filename">Filename to inspect.</param>
/// <returns>
/// The formatted chapter: a purely numeric capture is formatted directly, a month-name capture is
/// first translated to its month number. <see cref="DefaultChapter"/> is returned when no pattern
/// yields a value that is numeric or a known month.
/// </returns>
public static string ParseMagazineChapter(string filename)
{
    foreach (var pattern in MagazineChapterRegex)
    {
        foreach (Match match in pattern.Matches(filename))
        {
            var chapterGroup = match.Groups["Chapter"];
            if (!chapterGroup.Success || chapterGroup == Match.Empty)
            {
                continue;
            }

            var captured = chapterGroup.Value;

            // Digits-only captures (e.g. from "#2") need no translation
            if (IsNumberRegex().IsMatch(captured))
            {
                return FormatValue(captured, false);
            }

            // Otherwise the capture is expected to be a month name; unknown text moves on to the next match
            if (_monthMappings.TryGetValue(captured, out var monthNumber))
            {
                return FormatValue($"{monthNumber}", false);
            }
        }
    }

    return DefaultChapter;
}
private static string FormatValue(string value, bool hasPart)
{
if (!value.Contains('-'))
@ -1155,4 +1245,5 @@ public static class Parser
return null;
}
}

View file

@ -7,7 +7,7 @@
"name": "GPL-3.0",
"url": "https://github.com/Kareadita/Kavita/blob/develop/LICENSE"
},
"version": "0.7.14.1"
"version": "0.7.14.2"
},
"servers": [
{
@ -2909,7 +2909,8 @@
1,
2,
3,
4
4,
5
],
"type": "integer",
"format": "int32"
@ -2922,7 +2923,8 @@
1,
2,
3,
4
4,
5
],
"type": "integer",
"format": "int32"
@ -2935,7 +2937,8 @@
1,
2,
3,
4
4,
5
],
"type": "integer",
"format": "int32"
@ -3619,7 +3622,8 @@
1,
2,
3,
4
4,
5
],
"type": "integer",
"format": "int32"
@ -13526,7 +13530,8 @@
1,
2,
3,
4
4,
5
],
"type": "integer",
"format": "int32"
@ -14119,7 +14124,8 @@
1,
2,
3,
4
4,
5
],
"type": "integer",
"description": "Library type",
@ -15885,7 +15891,8 @@
1,
2,
3,
4
4,
5
],
"type": "integer",
"format": "int32"
@ -15999,7 +16006,8 @@
1,
2,
3,
4
4,
5
],
"type": "integer",
"format": "int32"
@ -16989,7 +16997,8 @@
1,
2,
3,
4
4,
5
],
"type": "integer",
"format": "int32"
@ -17042,7 +17051,8 @@
1,
2,
3,
4
4,
5
],
"type": "integer",
"format": "int32"
@ -19381,7 +19391,8 @@
1,
2,
3,
4
4,
5
],
"type": "integer",
"format": "int32"