PDF Metadata Support (#3552)

Co-authored-by: Matthias Neeracher <microtherion@gmail.com>
2025-02-16 15:10:15 -06:00 · 2025-02-16 15:10:15 -06:00 · f76de42b28
commit f76de42b28
parent 56108eb373
24 changed files with 1949 additions and 57 deletions
--- a/API/Helpers/PdfComicInfoExtractor.cs
+++ b/API/Helpers/PdfComicInfoExtractor.cs
@ -0,0 +1,159 @@
+/// Translate PDF metadata (See PdfMetadataExtractor.cs) into ComicInfo structure.
+
+// Contributed by https://github.com/microtherion
+
+// All references to the "PDF Spec" (section numbers, etc) refer to the
+// PDF 1.7 Specification a.k.a. PDF32000-1:2008
+// https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf
+
+using System;
+using System.Xml;
+using System.Text;
+using System.IO;
+using System.Diagnostics;
+using API.Data.Metadata;
+using API.Entities.Enums;
+using API.Services;
+using API.Services.Tasks.Scanner.Parser;
+using Microsoft.Extensions.Logging;
+using Nager.ArticleNumber;
+using System.Collections.Generic;
+
+namespace API.Helpers;
+#nullable enable
+
+public interface IPdfComicInfoExtractor
+{
+    ComicInfo? GetComicInfo(string filePath);
+}
+
+public class PdfComicInfoExtractor : IPdfComicInfoExtractor
+{
+    private readonly ILogger<BookService> _logger;
+    private readonly IMediaErrorService _mediaErrorService;
+    private readonly string[] _pdfDateFormats = [ // PDF Spec 7.9.4
+            "D:yyyyMMddHHmmsszzz:", "D:yyyyMMddHHmmss+", "D:yyyyMMddHHmmss",
+            "D:yyyyMMddHHmmzzz:",  "D:yyyyMMddHHmm+",   "D:yyyyMMddHHmm",
+            "D:yyyyMMddHHzzz:", "D:yyyyMMddHH+", "D:yyyyMMddHH",
+            "D:yyyyMMdd", "D:yyyyMM", "D:yyyy"
+        ];
+
+    public PdfComicInfoExtractor(ILogger<BookService> logger, IMediaErrorService mediaErrorService)
+    {
+        _logger = logger;
+        _mediaErrorService = mediaErrorService;
+    }
+
+    private float? GetFloatFromText(string? text)
+    {
+        if (string.IsNullOrEmpty(text)) return null;
+
+        if (float.TryParse(text, out var value)) return value;
+
+        return null;
+    }
+
+    private DateTime? GetDateTimeFromText(string? text)
+    {
+        if (string.IsNullOrEmpty(text)) return null;
+
+        // Dates stored in the XMP metadata stream (PDF Spec 14.3.2)
+        // are stored in ISO 8601 format, which is handled by C# out of the box
+        if (DateTime.TryParse(text, out var date)) return date;
+
+        // Dates stored in the document information directory (PDF Spec 14.3.3)
+        // are stored in a proprietary format (PDF Spec 7.9.4) that needs to be
+        // massaged slightly to be expressible by a DateTime format.
+        if (text[0] != 'D') {
+            text = "D:" + text;
+        }
+        text = text.Replace("'", ":");
+        text = text.Replace("Z", "+");
+
+        foreach(var format in _pdfDateFormats)
+        {
+            if (DateTime.TryParseExact(text, format, null, System.Globalization.DateTimeStyles.None, out var pdfDate)) return pdfDate;
+        }
+
+        return null;
+    }
+
+    private string? MaybeGetMetadata(Dictionary<string, string> metadata, string key)
+    {
+        return metadata.ContainsKey(key) ? metadata[key] : null;
+    }
+
+    private ComicInfo? GetComicInfoFromMetadata(Dictionary<string, string> metadata, string filePath)
+    {
+        var info = new ComicInfo();
+
+        var publicationDate = GetDateTimeFromText(MaybeGetMetadata(metadata, "CreationDate"));
+
+        if (publicationDate != null)
+        {
+            info.Year  = publicationDate.Value.Year;
+            info.Month = publicationDate.Value.Month;
+            info.Day   = publicationDate.Value.Day;
+        }
+
+        info.Summary   = MaybeGetMetadata(metadata, "Summary") ?? string.Empty;
+        info.Publisher = MaybeGetMetadata(metadata, "Publisher") ?? string.Empty;
+        info.Writer    = MaybeGetMetadata(metadata, "Author") ?? string.Empty;
+        info.Title     = MaybeGetMetadata(metadata, "Title") ?? string.Empty;
+        info.Genre     = MaybeGetMetadata(metadata, "Subject") ?? string.Empty;
+        info.LanguageISO = BookService.ValidateLanguage(MaybeGetMetadata(metadata, "Language"));
+        info.Isbn      = MaybeGetMetadata(metadata, "ISBN") ?? string.Empty;
+
+        if (info.Isbn != string.Empty && !ArticleNumberHelper.IsValidIsbn10(info.Isbn) && !ArticleNumberHelper.IsValidIsbn13(info.Isbn))
+        {
+            _logger.LogDebug("[BookService] {File} has an invalid ISBN number", filePath);
+            info.Isbn = string.Empty;
+        }
+
+        info.UserRating = GetFloatFromText(MaybeGetMetadata(metadata, "UserRating")) ?? 0.0f;
+        info.TitleSort  = MaybeGetMetadata(metadata, "TitleSort") ?? string.Empty;
+        info.Series     = MaybeGetMetadata(metadata, "Series") ?? info.TitleSort;
+        info.SeriesSort = info.Series;
+        info.Volume     = (GetFloatFromText(MaybeGetMetadata(metadata, "Volume")) ?? 0.0f).ToString();
+
+        // If this is a single book and not a collection, set publication status to Completed
+        if (string.IsNullOrEmpty(info.Volume) && Parser.ParseVolume(filePath, LibraryType.Manga).Equals(Parser.LooseLeafVolume))
+        {
+            info.Count = 1;
+        }
+
+        // Removed as probably unneeded per discussion in https://github.com/Kareadita/Kavita/pull/3108#discussion_r1956747782
+        //
+        // var hasVolumeInSeries = !Parser.ParseVolume(info.Title, LibraryType.Manga)
+        //     .Equals(Parser.LooseLeafVolume);
+
+        // if (string.IsNullOrEmpty(info.Volume) && hasVolumeInSeries && (!info.Series.Equals(info.Title) || string.IsNullOrEmpty(info.Series)))
+        // {
+        //     // This is likely a light novel for which we can set series from parsed title
+        //     info.Series = Parser.ParseSeries(info.Title, LibraryType.Manga);
+        //     info.Volume = Parser.ParseVolume(info.Title, LibraryType.Manga);
+        // }
+
+        ComicInfo.CleanComicInfo(info);
+
+        return info;
+    }
+
+    public ComicInfo? GetComicInfo(string filePath)
+    {
+        try
+        {
+            var extractor = new PdfMetadataExtractor(_logger, filePath);
+
+            return GetComicInfoFromMetadata(extractor.GetMetadata(), filePath);
+        }
+        catch (Exception ex)
+        {
+            _logger.LogWarning(ex, "[GetComicInfo] There was an exception parsing PDF metadata for {File}", filePath);
+            _mediaErrorService.ReportMediaIssue(filePath, MediaErrorProducer.BookService,
+                "There was an exception parsing PDF metadata", ex);
+        }
+
+        return null;
+    }
+}