PDF Metadata Support (#3552)

Co-authored-by: Matthias Neeracher <microtherion@gmail.com>
2025-02-16 15:10:15 -06:00 · 2025-02-16 15:10:15 -06:00 · f76de42b28
commit f76de42b28
parent 56108eb373
24 changed files with 1949 additions and 57 deletions
--- a/API/Helpers/PdfComicInfoExtractor.cs
+++ b/API/Helpers/PdfComicInfoExtractor.cs
@ -0,0 +1,159 @@
+// Translates PDF metadata (see PdfMetadataExtractor.cs) into a ComicInfo structure.
+
+// Contributed by https://github.com/microtherion
+
+// All references to the "PDF Spec" (section numbers, etc) refer to the
+// PDF 1.7 Specification a.k.a. PDF32000-1:2008
+// https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf
+
+using System;
+using System.Xml;
+using System.Text;
+using System.IO;
+using System.Diagnostics;
+using API.Data.Metadata;
+using API.Entities.Enums;
+using API.Services;
+using API.Services.Tasks.Scanner.Parser;
+using Microsoft.Extensions.Logging;
+using Nager.ArticleNumber;
+using System.Collections.Generic;
+
+namespace API.Helpers;
+#nullable enable
+
+/// <summary>
+/// Produces <see cref="ComicInfo"/> metadata for a PDF file.
+/// </summary>
+public interface IPdfComicInfoExtractor
+{
+    /// <summary>
+    /// Reads the metadata of the PDF at <paramref name="filePath"/> and maps it to a <see cref="ComicInfo"/>.
+    /// </summary>
+    /// <param name="filePath">Path of the PDF file to inspect.</param>
+    /// <returns>The mapped metadata, or <c>null</c> when none could be produced.</returns>
+    ComicInfo? GetComicInfo(string filePath);
+}
+
+/// <summary>
+/// Maps the key/value metadata returned by <c>PdfMetadataExtractor.GetMetadata()</c>
+/// onto a <see cref="ComicInfo"/>.
+/// </summary>
+public class PdfComicInfoExtractor : IPdfComicInfoExtractor
+{
+    private readonly ILogger<BookService> _logger;
+    private readonly IMediaErrorService _mediaErrorService;
+
+    // PDF Spec 7.9.4. These formats are matched against the *normalized* text built in
+    // GetDateTimeFromText: apostrophes become ':', 'Z' becomes '+' and a trailing ':' is
+    // dropped. "zzz" therefore covers both "-08'00'" and "-08'00"; "zz" covers an
+    // hours-only offset ("-08'"); the literal '+' covers a bare "Z".
+    private static readonly string[] PdfDateFormats = [
+            "D:yyyyMMddHHmmsszzz", "D:yyyyMMddHHmmsszz", "D:yyyyMMddHHmmss+", "D:yyyyMMddHHmmss",
+            "D:yyyyMMddHHmmzzz",   "D:yyyyMMddHHmmzz",   "D:yyyyMMddHHmm+",   "D:yyyyMMddHHmm",
+            "D:yyyyMMddHHzzz",     "D:yyyyMMddHHzz",     "D:yyyyMMddHH+",     "D:yyyyMMddHH",
+            "D:yyyyMMdd", "D:yyyyMM", "D:yyyy"
+        ];
+
+    /// <summary>
+    /// Creates an extractor that logs through <paramref name="logger"/> and reports
+    /// unreadable files to <paramref name="mediaErrorService"/>.
+    /// </summary>
+    public PdfComicInfoExtractor(ILogger<BookService> logger, IMediaErrorService mediaErrorService)
+    {
+        _logger = logger;
+        _mediaErrorService = mediaErrorService;
+    }
+
+    /// <summary>
+    /// Parses a metadata value as a floating point number.
+    /// </summary>
+    /// <param name="text">Raw metadata value; may be null or empty.</param>
+    /// <returns>The parsed value, or <c>null</c> when the text is missing or not a number.</returns>
+    private static float? GetFloatFromText(string? text)
+    {
+        if (string.IsNullOrEmpty(text)) return null;
+
+        // Metadata is machine-written, so parse with the invariant culture: the result must not
+        // depend on the locale of the server (e.g. "1.5" must not fail or become 15 under de-DE).
+        const System.Globalization.NumberStyles styles =
+            System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands;
+
+        return float.TryParse(text, styles, System.Globalization.CultureInfo.InvariantCulture, out var value)
+            ? value
+            : null;
+    }
+
+    /// <summary>
+    /// Parses a date taken either from the XMP metadata stream (ISO 8601) or from the
+    /// document information dictionary (PDF date string, PDF Spec 7.9.4).
+    /// </summary>
+    /// <param name="text">Raw metadata value; may be null or empty.</param>
+    /// <returns>
+    /// The date and time exactly as written in the document (any UTC offset is accepted but not
+    /// applied), or <c>null</c> when the text is missing or in an unrecognized format.
+    /// </returns>
+    private static DateTime? GetDateTimeFromText(string? text)
+    {
+        if (string.IsNullOrEmpty(text)) return null;
+
+        var invariant = System.Globalization.CultureInfo.InvariantCulture;
+        const System.Globalization.DateTimeStyles styles = System.Globalization.DateTimeStyles.None;
+
+        // Parse as DateTimeOffset and take .DateTime: parsing an offset-bearing value as
+        // DateTime would shift it into the server's local time zone, which can move the
+        // publication date across a day (or year) boundary.
+
+        // Dates stored in the XMP metadata stream (PDF Spec 14.3.2)
+        // are stored in ISO 8601 format, which is handled by C# out of the box
+        if (DateTimeOffset.TryParse(text, invariant, styles, out var isoDate)) return isoDate.DateTime;
+
+        // Dates stored in the document information directory (PDF Spec 14.3.3)
+        // are stored in a proprietary format (PDF Spec 7.9.4) that needs to be
+        // massaged slightly to be expressible by a DateTime format.
+        var normalized = text[0] == 'D' ? text : "D:" + text;
+        normalized = normalized
+            .Replace("'", ":")
+            .Replace("Z", "+")
+            .TrimEnd(':'); // the apostrophe after the offset is optional, so drop it when present
+
+        return DateTimeOffset.TryParseExact(normalized, PdfDateFormats, invariant, styles, out var pdfDate)
+            ? pdfDate.DateTime
+            : null;
+    }
+
+    /// <summary>
+    /// Looks up <paramref name="key"/> in <paramref name="metadata"/>.
+    /// </summary>
+    /// <returns>The stored value, or <c>null</c> when the key is absent.</returns>
+    private static string? MaybeGetMetadata(Dictionary<string, string> metadata, string key)
+    {
+        return metadata.TryGetValue(key, out var value) ? value : null;
+    }
+
+    /// <summary>
+    /// Builds a <see cref="ComicInfo"/> from extracted PDF metadata.
+    /// </summary>
+    /// <param name="metadata">Metadata keyed by field name ("Title", "Author", "CreationDate", ...).</param>
+    /// <param name="filePath">Path of the source file; used for logging and for volume parsing.</param>
+    /// <returns>The populated <see cref="ComicInfo"/>.</returns>
+    private ComicInfo? GetComicInfoFromMetadata(Dictionary<string, string> metadata, string filePath)
+    {
+        var info = new ComicInfo();
+
+        var publicationDate = GetDateTimeFromText(MaybeGetMetadata(metadata, "CreationDate"));
+
+        if (publicationDate != null)
+        {
+            info.Year  = publicationDate.Value.Year;
+            info.Month = publicationDate.Value.Month;
+            info.Day   = publicationDate.Value.Day;
+        }
+
+        info.Summary   = MaybeGetMetadata(metadata, "Summary") ?? string.Empty;
+        info.Publisher = MaybeGetMetadata(metadata, "Publisher") ?? string.Empty;
+        info.Writer    = MaybeGetMetadata(metadata, "Author") ?? string.Empty;
+        info.Title     = MaybeGetMetadata(metadata, "Title") ?? string.Empty;
+        info.Genre     = MaybeGetMetadata(metadata, "Subject") ?? string.Empty;
+        info.LanguageISO = BookService.ValidateLanguage(MaybeGetMetadata(metadata, "Language"));
+        info.Isbn      = MaybeGetMetadata(metadata, "ISBN") ?? string.Empty;
+
+        if (info.Isbn != string.Empty && !ArticleNumberHelper.IsValidIsbn10(info.Isbn) && !ArticleNumberHelper.IsValidIsbn13(info.Isbn))
+        {
+            _logger.LogDebug("[BookService] {File} has an invalid ISBN number", filePath);
+            info.Isbn = string.Empty;
+        }
+
+        info.UserRating = GetFloatFromText(MaybeGetMetadata(metadata, "UserRating")) ?? 0.0f;
+        info.TitleSort  = MaybeGetMetadata(metadata, "TitleSort") ?? string.Empty;
+        info.Series     = MaybeGetMetadata(metadata, "Series") ?? info.TitleSort;
+        info.SeriesSort = info.Series;
+
+        // Leave Volume empty when the PDF carries no (valid) volume number. Defaulting to "0"
+        // would claim a volume for every PDF and make the check below unreachable.
+        var volume = GetFloatFromText(MaybeGetMetadata(metadata, "Volume"));
+        info.Volume = volume?.ToString(System.Globalization.CultureInfo.InvariantCulture) ?? string.Empty;
+
+        // If this is a single book and not a collection, set publication status to Completed
+        if (string.IsNullOrEmpty(info.Volume) && Parser.ParseVolume(filePath, LibraryType.Manga).Equals(Parser.LooseLeafVolume))
+        {
+            info.Count = 1;
+        }
+
+        // Inferring series/volume from the title was intentionally left out, per discussion in
+        // https://github.com/Kareadita/Kavita/pull/3108#discussion_r1956747782
+
+        ComicInfo.CleanComicInfo(info);
+
+        return info;
+    }
+
+    /// <summary>
+    /// Extracts the metadata of the PDF at <paramref name="filePath"/> and converts it to a
+    /// <see cref="ComicInfo"/>.
+    /// </summary>
+    /// <param name="filePath">Path of the PDF file to inspect.</param>
+    /// <returns>
+    /// The converted metadata, or <c>null</c> when extraction throws; in that case the failure is
+    /// logged and reported to the media error service.
+    /// </returns>
+    public ComicInfo? GetComicInfo(string filePath)
+    {
+        try
+        {
+            var extractor = new PdfMetadataExtractor(_logger, filePath);
+
+            return GetComicInfoFromMetadata(extractor.GetMetadata(), filePath);
+        }
+        catch (Exception ex)
+        {
+            _logger.LogWarning(ex, "[GetComicInfo] There was an exception parsing PDF metadata for {File}", filePath);
+            _mediaErrorService.ReportMediaIssue(filePath, MediaErrorProducer.BookService,
+                "There was an exception parsing PDF metadata", ex);
+        }
+
+        return null;
+    }
+}
--- a/API/Helpers/PdfMetadataExtractor.cs
+++ b/API/Helpers/PdfMetadataExtractor.cs
--- a/API/Services/BookService.cs
+++ b/API/Services/BookService.cs
@ -6,12 +6,14 @@ using System.Linq;
 using System.Text;
 using System.Text.RegularExpressions;
 using System.Threading.Tasks;
+using System.Xml;
 using API.Data.Metadata;
 using API.DTOs.Reader;
 using API.Entities;
 using API.Entities.Enums;
 using API.Extensions;
 using API.Services.Tasks.Scanner.Parser;
+using API.Helpers;
 using Docnet.Core;
 using Docnet.Core.Converters;
 using Docnet.Core.Models;
@ -69,6 +71,8 @@ public class BookService : IBookService
    private static readonly RecyclableMemoryStreamManager StreamManager = new ();
    private const string CssScopeClass = ".book-content";
    private const string BookApiUrl = "book-resources?file=";
+    private readonly PdfComicInfoExtractor _pdfComicInfoExtractor;
+
    public static readonly EpubReaderOptions BookReaderOptions = new()
    {
        PackageReaderOptions = new PackageReaderOptions
@ -84,6 +88,7 @@ public class BookService : IBookService
        _directoryService = directoryService;
        _imageService = imageService;
        _mediaErrorService = mediaErrorService;
+        _pdfComicInfoExtractor = new PdfComicInfoExtractor(_logger, _mediaErrorService);
    }

    private static bool HasClickableHrefPart(HtmlNode anchor)
@ -425,10 +430,8 @@ public class BookService : IBookService
        }
    }

-    public ComicInfo? GetComicInfo(string filePath)
+    private ComicInfo? GetEpubComicInfo(string filePath)
    {
-        if (!IsValidFile(filePath) || Parser.IsPdf(filePath)) return null;
-
        try
        {
            using var epubBook = EpubReader.OpenBook(filePath, BookReaderOptions);
@ -442,7 +445,7 @@ public class BookService : IBookService
            var (year, month, day) = GetPublicationDate(publicationDate);

            var summary = epubBook.Schema.Package.Metadata.Descriptions.FirstOrDefault();
-            var info =  new ComicInfo
+            var info = new ComicInfo
            {
                Summary = string.IsNullOrEmpty(summary?.Description) ? string.Empty : summary.Description,
                Publisher = string.Join(",", epubBook.Schema.Package.Metadata.Publishers.Select(p => p.Publisher)),
@ -583,6 +586,20 @@ public class BookService : IBookService
        return null;
    }

+    /// <summary>
+    /// Returns metadata for a book file. Files rejected by <c>IsValidFile</c> yield <c>null</c>;
+    /// PDFs are delegated to the PDF extractor and everything else to <c>GetEpubComicInfo</c>.
+    /// </summary>
+    /// <param name="filePath">Path of the file to inspect.</param>
+    public ComicInfo? GetComicInfo(string filePath)
+    {
+        if (!IsValidFile(filePath)) return null;
+
+        return Parser.IsPdf(filePath)
+            ? _pdfComicInfoExtractor.GetComicInfo(filePath)
+            : GetEpubComicInfo(filePath);
+    }
+
    private static void ExtractSortTitle(EpubMetadataMeta metadataItem, EpubBookRef epubBook, ComicInfo info)
    {
        var titleId = metadataItem.Refines?.Replace("#", string.Empty);
@ -685,7 +702,7 @@ public class BookService : IBookService
        return (year, month, day);
    }

-    private static string ValidateLanguage(string? language)
+    public static string ValidateLanguage(string? language)
    {
        if (string.IsNullOrEmpty(language)) return string.Empty;

--- a/API/Services/Plus/ExternalMetadataService.cs
+++ b/API/Services/Plus/ExternalMetadataService.cs
@ -566,7 +566,6 @@ public class ExternalMetadataService : IExternalMetadataService
            return false;
        }

-        var relatedSeriesDict = new Dictionary<int, Series>();
        foreach (var relation in externalMetadataRelations)
        {
            var names = new [] {relation.SeriesName.PreferredTitle, relation.SeriesName.RomajiTitle, relation.SeriesName.EnglishTitle, relation.SeriesName.NativeTitle};
@ -586,19 +585,6 @@ public class ExternalMetadataService : IExternalMetadataService

            if (relationshipExists) continue;

-            relatedSeriesDict[relatedSeries.Id] = relatedSeries;
-        }
-
-        // Process relationships
-        foreach (var relation in externalMetadataRelations)
-        {
-            var relatedSeries = relatedSeriesDict.GetValueOrDefault(
-                relatedSeriesDict.Keys.FirstOrDefault(k =>
-                    relatedSeriesDict[k].Name == relation.SeriesName.PreferredTitle ||
-                    relatedSeriesDict[k].Name == relation.SeriesName.NativeTitle));
-
-            if (relatedSeries == null) continue;
-
            // Add new relationship
            var newRelation = new SeriesRelation
            {
@ -969,7 +955,7 @@ public class ExternalMetadataService : IExternalMetadataService
            return false;
        }

-        if (!string.IsNullOrEmpty(externalMetadata.CoverUrl) && !settings.HasOverride(MetadataSettingField.Covers))
+        if (string.IsNullOrEmpty(externalMetadata.CoverUrl))
        {
            return false;
        }
--- a/API/Services/ReadingItemService.cs
+++ b/API/Services/ReadingItemService.cs
@ -52,7 +52,7 @@ public class ReadingItemService : IReadingItemService
    /// <returns></returns>
    private ComicInfo? GetComicInfo(string filePath)
    {
-        if (Parser.IsEpub(filePath))
+        if (Parser.IsEpub(filePath) || Parser.IsPdf(filePath))
        {
            return _bookService.GetComicInfo(filePath);
        }
--- a/API/Services/Tasks/Scanner/Parser/PdfParser.cs
+++ b/API/Services/Tasks/Scanner/Parser/PdfParser.cs
@ -68,6 +68,9 @@ public class PdfParser(IDirectoryService directoryService) : DefaultParser(direc
            ParseFromFallbackFolders(filePath, tempRootPath, type, ref ret);
        }

+        // Patch in other information from ComicInfo
+        UpdateFromComicInfo(ret);
+
        if (ret.Chapters == Parser.DefaultChapter && ret.Volumes == Parser.LooseLeafVolume && type == LibraryType.Book)
        {
            ret.IsSpecial = true;
--- a/API/Services/Tasks/Scanner/ProcessSeries.cs
+++ b/API/Services/Tasks/Scanner/ProcessSeries.cs
@ -285,7 +285,7 @@ public class ProcessSeries : IProcessSeries
        var firstChapter = SeriesService.GetFirstChapterForMetadata(series);

        var firstFile = firstChapter?.Files.FirstOrDefault();
-        if (firstFile == null || Parser.Parser.IsPdf(firstFile.FilePath)) return;
+        if (firstFile == null) return;

        var chapters = series.Volumes
            .SelectMany(volume => volume.Chapters)
--- a/API/config/appsettings.Development.json
+++ b/API/config/appsettings.Development.json
@ -2,7 +2,7 @@
  "TokenKey": "super secret unguessable key that is longer because we require it",
  "Port": 5000,
  "IpAddresses": "0.0.0.0,::",
-  "BaseUrl": "/test/",
+  "BaseUrl": "/",
  "Cache": 75,
  "AllowIFraming": false
 }