PDF Metadata Support (#3552)

Co-authored-by: Matthias Neeracher <microtherion@gmail.com>
This commit is contained in:
Joe Milazzo 2025-02-16 15:10:15 -06:00 committed by GitHub
parent 56108eb373
commit f76de42b28
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
24 changed files with 1949 additions and 57 deletions

View file

@ -0,0 +1,159 @@
// Translates PDF metadata (see PdfMetadataExtractor.cs) into a ComicInfo structure.
// Contributed by https://github.com/microtherion
// All references to the "PDF Spec" (section numbers, etc) refer to the
// PDF 1.7 Specification a.k.a. PDF32000-1:2008
// https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf
using System;
using System.Xml;
using System.Text;
using System.IO;
using System.Diagnostics;
using API.Data.Metadata;
using API.Entities.Enums;
using API.Services;
using API.Services.Tasks.Scanner.Parser;
using Microsoft.Extensions.Logging;
using Nager.ArticleNumber;
using System.Collections.Generic;
namespace API.Helpers;
#nullable enable
/// <summary>
/// Produces a <see cref="ComicInfo"/> for a file identified by its path.
/// </summary>
public interface IPdfComicInfoExtractor
{
/// <summary>
/// Builds a <see cref="ComicInfo"/> for the file at <paramref name="filePath"/>.
/// </summary>
/// <param name="filePath">Path of the file whose metadata should be read.</param>
/// <returns>The resulting <see cref="ComicInfo"/>, or <c>null</c> when none could be produced.</returns>
ComicInfo? GetComicInfo(string filePath);
}
/// <summary>
/// Translates the key/value metadata read from a PDF by <c>PdfMetadataExtractor</c>
/// into a <see cref="ComicInfo"/>.
/// </summary>
public class PdfComicInfoExtractor : IPdfComicInfoExtractor
{
    // Metadata is machine-written, so all parsing/formatting uses the invariant culture;
    // otherwise results would vary with the server's regional settings.
    private static readonly System.Globalization.CultureInfo Invariant =
        System.Globalization.CultureInfo.InvariantCulture;

    // Date layouts from PDF Spec 7.9.4, expressed against the text as normalised by
    // GetDateTimeFromText: apostrophes become ':', 'Z' becomes '+', and any trailing ':'
    // is dropped. Each time precision is tried with a full offset (zzz), an hours-only
    // offset (zz), a bare UTC marker (+) and no offset at all.
    private static readonly string[] PdfDateFormats = [
        "D:yyyyMMddHHmmsszzz", "D:yyyyMMddHHmmsszz", "D:yyyyMMddHHmmss+", "D:yyyyMMddHHmmss",
        "D:yyyyMMddHHmmzzz", "D:yyyyMMddHHmmzz", "D:yyyyMMddHHmm+", "D:yyyyMMddHHmm",
        "D:yyyyMMddHHzzz", "D:yyyyMMddHHzz", "D:yyyyMMddHH+", "D:yyyyMMddHH",
        "D:yyyyMMdd", "D:yyyyMM", "D:yyyy"
    ];

    private readonly ILogger<BookService> _logger;
    private readonly IMediaErrorService _mediaErrorService;

    /// <summary>
    /// Creates an extractor that logs through <paramref name="logger"/> and reports
    /// unreadable files to <paramref name="mediaErrorService"/>.
    /// </summary>
    public PdfComicInfoExtractor(ILogger<BookService> logger, IMediaErrorService mediaErrorService)
    {
        _logger = logger;
        _mediaErrorService = mediaErrorService;
    }

    /// <summary>
    /// Parses <paramref name="text"/> as a culture-invariant floating point number.
    /// </summary>
    /// <returns>The parsed value, or <c>null</c> when the text is missing or not numeric.</returns>
    private static float? GetFloatFromText(string? text)
    {
        if (string.IsNullOrEmpty(text)) return null;

        // Same number styles as the single-argument TryParse overload, but with '.' as the
        // decimal separator regardless of the current culture.
        const System.Globalization.NumberStyles styles =
            System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands;

        return float.TryParse(text, styles, Invariant, out var value) ? value : null;
    }

    /// <summary>
    /// Parses a date taken from PDF metadata.
    /// </summary>
    /// <param name="text">Either an ISO 8601 date or a PDF date string (PDF Spec 7.9.4).</param>
    /// <returns>The parsed date, or <c>null</c> when the text is missing or unrecognised.</returns>
    private static DateTime? GetDateTimeFromText(string? text)
    {
        if (string.IsNullOrEmpty(text)) return null;

        // Dates stored in the XMP metadata stream (PDF Spec 14.3.2)
        // are stored in ISO 8601 format, which is handled by C# out of the box
        if (DateTime.TryParse(text, Invariant, System.Globalization.DateTimeStyles.None, out var isoDate))
        {
            return isoDate;
        }

        // Dates stored in the document information directory (PDF Spec 14.3.3)
        // are stored in a proprietary format (PDF Spec 7.9.4) that needs to be
        // massaged slightly to be expressible by a DateTime format.
        var normalized = text[0] == 'D' ? text : "D:" + text;

        // "-08'00'" / "-08'00" / "-08'" all end up as "-08:00" or "-08": the apostrophes turn
        // into ':' and a dangling one at the end (optional in the wild) is trimmed away.
        normalized = normalized.Replace("'", ":").Replace("Z", "+").TrimEnd(':');

        foreach (var format in PdfDateFormats)
        {
            if (DateTime.TryParseExact(normalized, format, Invariant,
                    System.Globalization.DateTimeStyles.None, out var pdfDate))
            {
                return pdfDate;
            }
        }

        return null;
    }

    /// <summary>
    /// Looks up <paramref name="key"/> in <paramref name="metadata"/>.
    /// </summary>
    /// <returns>The stored value, or <c>null</c> when the key is absent.</returns>
    private static string? MaybeGetMetadata(Dictionary<string, string> metadata, string key)
    {
        return metadata.TryGetValue(key, out var value) ? value : null;
    }

    /// <summary>
    /// Maps the extracted metadata dictionary onto a new <see cref="ComicInfo"/>.
    /// </summary>
    /// <param name="metadata">Metadata keyed by names such as "Title", "Author" or "CreationDate".</param>
    /// <param name="filePath">Path of the PDF; used for logging and for volume parsing of the file name.</param>
    /// <returns>The populated <see cref="ComicInfo"/>.</returns>
    private ComicInfo? GetComicInfoFromMetadata(Dictionary<string, string> metadata, string filePath)
    {
        var info = new ComicInfo();

        var publicationDate = GetDateTimeFromText(MaybeGetMetadata(metadata, "CreationDate"));
        if (publicationDate != null)
        {
            info.Year = publicationDate.Value.Year;
            info.Month = publicationDate.Value.Month;
            info.Day = publicationDate.Value.Day;
        }

        info.Summary = MaybeGetMetadata(metadata, "Summary") ?? string.Empty;
        info.Publisher = MaybeGetMetadata(metadata, "Publisher") ?? string.Empty;
        info.Writer = MaybeGetMetadata(metadata, "Author") ?? string.Empty;
        info.Title = MaybeGetMetadata(metadata, "Title") ?? string.Empty;
        info.Genre = MaybeGetMetadata(metadata, "Subject") ?? string.Empty;
        info.LanguageISO = BookService.ValidateLanguage(MaybeGetMetadata(metadata, "Language"));

        info.Isbn = MaybeGetMetadata(metadata, "ISBN") ?? string.Empty;
        if (info.Isbn != string.Empty && !ArticleNumberHelper.IsValidIsbn10(info.Isbn) && !ArticleNumberHelper.IsValidIsbn13(info.Isbn))
        {
            _logger.LogDebug("[BookService] {File} has an invalid ISBN number", filePath);
            info.Isbn = string.Empty;
        }

        info.UserRating = GetFloatFromText(MaybeGetMetadata(metadata, "UserRating")) ?? 0.0f;
        info.TitleSort = MaybeGetMetadata(metadata, "TitleSort") ?? string.Empty;
        info.Series = MaybeGetMetadata(metadata, "Series") ?? info.TitleSort;
        info.SeriesSort = info.Series;

        // Only report a volume when the metadata really carries a numeric one. Falling back to
        // "0" would make Volume always non-empty, which both invents a volume for every PDF and
        // makes the single-book check below impossible to satisfy.
        var volume = GetFloatFromText(MaybeGetMetadata(metadata, "Volume"));
        info.Volume = volume?.ToString(Invariant) ?? string.Empty;

        // If this is a single book and not a collection, set publication status to Completed
        if (string.IsNullOrEmpty(info.Volume) && Parser.ParseVolume(filePath, LibraryType.Manga).Equals(Parser.LooseLeafVolume))
        {
            info.Count = 1;
        }

        // Series/Volume are intentionally not inferred from the title; that step was removed as
        // probably unneeded per https://github.com/Kareadita/Kavita/pull/3108#discussion_r1956747782

        ComicInfo.CleanComicInfo(info);
        return info;
    }

    /// <summary>
    /// Reads the metadata of the PDF at <paramref name="filePath"/> and converts it to a
    /// <see cref="ComicInfo"/>.
    /// </summary>
    /// <param name="filePath">Path of the PDF file.</param>
    /// <returns>
    /// The converted metadata, or <c>null</c> when extraction throws; in that case the failure
    /// is logged and reported to the media error service.
    /// </returns>
    public ComicInfo? GetComicInfo(string filePath)
    {
        try
        {
            var extractor = new PdfMetadataExtractor(_logger, filePath);
            return GetComicInfoFromMetadata(extractor.GetMetadata(), filePath);
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "[GetComicInfo] There was an exception parsing PDF metadata for {File}", filePath);
            _mediaErrorService.ReportMediaIssue(filePath, MediaErrorProducer.BookService,
                "There was an exception parsing PDF metadata", ex);
        }

        return null;
    }
}

File diff suppressed because it is too large Load diff

View file

@ -6,12 +6,14 @@ using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using System.Xml;
using API.Data.Metadata;
using API.DTOs.Reader;
using API.Entities;
using API.Entities.Enums;
using API.Extensions;
using API.Services.Tasks.Scanner.Parser;
using API.Helpers;
using Docnet.Core;
using Docnet.Core.Converters;
using Docnet.Core.Models;
@ -69,6 +71,8 @@ public class BookService : IBookService
private static readonly RecyclableMemoryStreamManager StreamManager = new ();
private const string CssScopeClass = ".book-content";
private const string BookApiUrl = "book-resources?file=";
private readonly PdfComicInfoExtractor _pdfComicInfoExtractor;
public static readonly EpubReaderOptions BookReaderOptions = new()
{
PackageReaderOptions = new PackageReaderOptions
@ -84,6 +88,7 @@ public class BookService : IBookService
_directoryService = directoryService;
_imageService = imageService;
_mediaErrorService = mediaErrorService;
_pdfComicInfoExtractor = new PdfComicInfoExtractor(_logger, _mediaErrorService);
}
private static bool HasClickableHrefPart(HtmlNode anchor)
@ -425,10 +430,8 @@ public class BookService : IBookService
}
}
public ComicInfo? GetComicInfo(string filePath)
private ComicInfo? GetEpubComicInfo(string filePath)
{
if (!IsValidFile(filePath) || Parser.IsPdf(filePath)) return null;
try
{
using var epubBook = EpubReader.OpenBook(filePath, BookReaderOptions);
@ -442,7 +445,7 @@ public class BookService : IBookService
var (year, month, day) = GetPublicationDate(publicationDate);
var summary = epubBook.Schema.Package.Metadata.Descriptions.FirstOrDefault();
var info = new ComicInfo
var info = new ComicInfo
{
Summary = string.IsNullOrEmpty(summary?.Description) ? string.Empty : summary.Description,
Publisher = string.Join(",", epubBook.Schema.Package.Metadata.Publishers.Select(p => p.Publisher)),
@ -583,6 +586,20 @@ public class BookService : IBookService
return null;
}
/// <summary>
/// Returns the <see cref="ComicInfo"/> for a book file, dispatching on the file type.
/// </summary>
/// <param name="filePath">Path of the book file.</param>
/// <returns>
/// <c>null</c> when <c>IsValidFile</c> rejects the path; otherwise whatever the PDF extractor
/// (for PDF files) or the epub reader (for everything else) produces.
/// </returns>
public ComicInfo? GetComicInfo(string filePath)
{
    if (!IsValidFile(filePath))
    {
        return null;
    }

    return Parser.IsPdf(filePath)
        ? _pdfComicInfoExtractor.GetComicInfo(filePath)
        : GetEpubComicInfo(filePath);
}
private static void ExtractSortTitle(EpubMetadataMeta metadataItem, EpubBookRef epubBook, ComicInfo info)
{
var titleId = metadataItem.Refines?.Replace("#", string.Empty);
@ -685,7 +702,7 @@ public class BookService : IBookService
return (year, month, day);
}
private static string ValidateLanguage(string? language)
public static string ValidateLanguage(string? language)
{
if (string.IsNullOrEmpty(language)) return string.Empty;

View file

@ -566,7 +566,6 @@ public class ExternalMetadataService : IExternalMetadataService
return false;
}
var relatedSeriesDict = new Dictionary<int, Series>();
foreach (var relation in externalMetadataRelations)
{
var names = new [] {relation.SeriesName.PreferredTitle, relation.SeriesName.RomajiTitle, relation.SeriesName.EnglishTitle, relation.SeriesName.NativeTitle};
@ -586,19 +585,6 @@ public class ExternalMetadataService : IExternalMetadataService
if (relationshipExists) continue;
relatedSeriesDict[relatedSeries.Id] = relatedSeries;
}
// Process relationships
foreach (var relation in externalMetadataRelations)
{
var relatedSeries = relatedSeriesDict.GetValueOrDefault(
relatedSeriesDict.Keys.FirstOrDefault(k =>
relatedSeriesDict[k].Name == relation.SeriesName.PreferredTitle ||
relatedSeriesDict[k].Name == relation.SeriesName.NativeTitle));
if (relatedSeries == null) continue;
// Add new relationship
var newRelation = new SeriesRelation
{
@ -969,7 +955,7 @@ public class ExternalMetadataService : IExternalMetadataService
return false;
}
if (!string.IsNullOrEmpty(externalMetadata.CoverUrl) && !settings.HasOverride(MetadataSettingField.Covers))
if (string.IsNullOrEmpty(externalMetadata.CoverUrl))
{
return false;
}

View file

@ -52,7 +52,7 @@ public class ReadingItemService : IReadingItemService
/// <returns></returns>
private ComicInfo? GetComicInfo(string filePath)
{
if (Parser.IsEpub(filePath))
if (Parser.IsEpub(filePath) || Parser.IsPdf(filePath))
{
return _bookService.GetComicInfo(filePath);
}

View file

@ -68,6 +68,9 @@ public class PdfParser(IDirectoryService directoryService) : DefaultParser(direc
ParseFromFallbackFolders(filePath, tempRootPath, type, ref ret);
}
// Patch in other information from ComicInfo
UpdateFromComicInfo(ret);
if (ret.Chapters == Parser.DefaultChapter && ret.Volumes == Parser.LooseLeafVolume && type == LibraryType.Book)
{
ret.IsSpecial = true;

View file

@ -285,7 +285,7 @@ public class ProcessSeries : IProcessSeries
var firstChapter = SeriesService.GetFirstChapterForMetadata(series);
var firstFile = firstChapter?.Files.FirstOrDefault();
if (firstFile == null || Parser.Parser.IsPdf(firstFile.FilePath)) return;
if (firstFile == null) return;
var chapters = series.Volumes
.SelectMany(volume => volume.Chapters)

View file

@ -2,7 +2,7 @@
"TokenKey": "super secret unguessable key that is longer because we require it",
"Port": 5000,
"IpAddresses": "0.0.0.0,::",
"BaseUrl": "/test/",
"BaseUrl": "/",
"Cache": 75,
"AllowIFraming": false
}