PDF Metadata Support (#3552)
Co-authored-by: Matthias Neeracher <microtherion@gmail.com>
This commit is contained in:
parent
56108eb373
commit
f76de42b28
24 changed files with 1949 additions and 57 deletions
159
API/Helpers/PdfComicInfoExtractor.cs
Normal file
159
API/Helpers/PdfComicInfoExtractor.cs
Normal file
|
|
@ -0,0 +1,159 @@
|
|||
/// Translates PDF metadata (see PdfMetadataExtractor.cs) into a ComicInfo structure.
|
||||
|
||||
// Contributed by https://github.com/microtherion
|
||||
|
||||
// All references to the "PDF Spec" (section numbers, etc) refer to the
|
||||
// PDF 1.7 Specification a.k.a. PDF32000-1:2008
|
||||
// https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf
|
||||
|
||||
using System;
|
||||
using System.Xml;
|
||||
using System.Text;
|
||||
using System.IO;
|
||||
using System.Diagnostics;
|
||||
using API.Data.Metadata;
|
||||
using API.Entities.Enums;
|
||||
using API.Services;
|
||||
using API.Services.Tasks.Scanner.Parser;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Nager.ArticleNumber;
|
||||
using System.Collections.Generic;
|
||||
|
||||
namespace API.Helpers;
|
||||
#nullable enable
|
||||
|
||||
/// <summary>
/// Produces a <see cref="ComicInfo"/> from the metadata of a PDF file.
/// </summary>
public interface IPdfComicInfoExtractor
{
    /// <summary>
    /// Builds a <see cref="ComicInfo"/> for the PDF located at <paramref name="filePath"/>.
    /// </summary>
    /// <param name="filePath">Path of the PDF file to inspect.</param>
    /// <returns>The extracted information, or <c>null</c> when none could be produced.</returns>
    ComicInfo? GetComicInfo(string filePath);
}
|
||||
|
||||
/// <summary>
/// Translates the key/value metadata read from a PDF (see PdfMetadataExtractor.cs)
/// into a <see cref="ComicInfo"/>.
/// </summary>
public class PdfComicInfoExtractor : IPdfComicInfoExtractor
{
    /// <summary>
    /// Culture used for every parse and format in this class. PDF metadata is machine-written,
    /// so it must not be interpreted with the regional settings of the host. This also matters
    /// for <see cref="PdfDateFormats"/>: the ':' in a custom date format is the culture's time
    /// separator, so "D:" only matches a literal "D:" when that separator is ':'.
    /// </summary>
    private static readonly System.Globalization.CultureInfo Invariant =
        System.Globalization.CultureInfo.InvariantCulture;

    /// <summary>
    /// Accepted shapes of a PDF date string (PDF Spec 7.9.4) after the normalisation done in
    /// <see cref="GetDateTimeFromText"/>, ordered from most to least precise.
    /// </summary>
    private static readonly string[] PdfDateFormats = [ // PDF Spec 7.9.4
        "D:yyyyMMddHHmmsszzz:", "D:yyyyMMddHHmmss+", "D:yyyyMMddHHmmss",
        "D:yyyyMMddHHmmzzz:", "D:yyyyMMddHHmm+", "D:yyyyMMddHHmm",
        "D:yyyyMMddHHzzz:", "D:yyyyMMddHH+", "D:yyyyMMddHH",
        "D:yyyyMMdd", "D:yyyyMM", "D:yyyy"
    ];

    private readonly ILogger<BookService> _logger;
    private readonly IMediaErrorService _mediaErrorService;

    /// <summary>
    /// Creates an extractor that logs through <paramref name="logger"/> and reports unreadable
    /// files through <paramref name="mediaErrorService"/>.
    /// </summary>
    public PdfComicInfoExtractor(ILogger<BookService> logger, IMediaErrorService mediaErrorService)
    {
        _logger = logger;
        _mediaErrorService = mediaErrorService;
    }

    /// <summary>
    /// Parses <paramref name="text"/> as a culture-invariant floating point number.
    /// </summary>
    /// <returns>The parsed value, or <c>null</c> when the text is null, empty or not a number.</returns>
    private static float? GetFloatFromText(string? text)
    {
        if (string.IsNullOrEmpty(text)) return null;

        // Same number styles as the parameterless float.TryParse overload, but with a fixed
        // culture so that "1.5" is never read as 15 on hosts that use ',' as decimal separator.
        const System.Globalization.NumberStyles styles =
            System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands;

        if (float.TryParse(text, styles, Invariant, out var value)) return value;

        return null;
    }

    /// <summary>
    /// Parses a date taken from PDF metadata, accepting both ISO 8601 text and the PDF-specific
    /// "D:YYYYMMDD..." form.
    /// </summary>
    /// <returns>The parsed date, or <c>null</c> when the text is null, empty or unrecognised.</returns>
    private static DateTime? GetDateTimeFromText(string? text)
    {
        if (string.IsNullOrEmpty(text)) return null;

        // Dates stored in the XMP metadata stream (PDF Spec 14.3.2)
        // are stored in ISO 8601 format, which is handled by C# out of the box
        if (DateTime.TryParse(text, Invariant, System.Globalization.DateTimeStyles.None, out var date)) return date;

        // Dates stored in the document information dictionary (PDF Spec 14.3.3)
        // are stored in a proprietary format (PDF Spec 7.9.4) that needs to be
        // massaged slightly to be expressible by a DateTime format.
        if (!text.StartsWith("D:", StringComparison.Ordinal))
        {
            text = "D:" + text;
        }

        // "+01'00'" becomes "+01:00:" (matched by "zzz:") and a bare "Z" becomes "+"
        // (matched by the literal '+' in the formats without an offset).
        text = text.Replace("'", ":").Replace("Z", "+");

        foreach (var format in PdfDateFormats)
        {
            if (DateTime.TryParseExact(text, format, Invariant, System.Globalization.DateTimeStyles.None, out var pdfDate))
            {
                return pdfDate;
            }
        }

        return null;
    }

    /// <summary>
    /// Looks up <paramref name="key"/> in <paramref name="metadata"/>.
    /// </summary>
    /// <returns>The stored value, or <c>null</c> when the key is absent.</returns>
    private static string? MaybeGetMetadata(Dictionary<string, string> metadata, string key)
    {
        return metadata.TryGetValue(key, out var value) ? value : null;
    }

    /// <summary>
    /// Maps the extracted PDF metadata onto a new <see cref="ComicInfo"/>.
    /// </summary>
    /// <param name="metadata">Key/value pairs read from the PDF.</param>
    /// <param name="filePath">Path of the PDF; used for logging and for volume detection from the file name.</param>
    /// <returns>The populated <see cref="ComicInfo"/>; this method always returns an instance.</returns>
    private ComicInfo? GetComicInfoFromMetadata(Dictionary<string, string> metadata, string filePath)
    {
        var info = new ComicInfo();

        if (GetDateTimeFromText(MaybeGetMetadata(metadata, "CreationDate")) is { } publicationDate)
        {
            info.Year = publicationDate.Year;
            info.Month = publicationDate.Month;
            info.Day = publicationDate.Day;
        }

        info.Summary = MaybeGetMetadata(metadata, "Summary") ?? string.Empty;
        info.Publisher = MaybeGetMetadata(metadata, "Publisher") ?? string.Empty;
        info.Writer = MaybeGetMetadata(metadata, "Author") ?? string.Empty;
        info.Title = MaybeGetMetadata(metadata, "Title") ?? string.Empty;
        info.Genre = MaybeGetMetadata(metadata, "Subject") ?? string.Empty;
        info.LanguageISO = BookService.ValidateLanguage(MaybeGetMetadata(metadata, "Language"));
        info.Isbn = MaybeGetMetadata(metadata, "ISBN") ?? string.Empty;

        // Drop anything that is neither a valid ISBN-10 nor a valid ISBN-13.
        if (!string.IsNullOrEmpty(info.Isbn) && !ArticleNumberHelper.IsValidIsbn10(info.Isbn) && !ArticleNumberHelper.IsValidIsbn13(info.Isbn))
        {
            _logger.LogDebug("[BookService] {File} has an invalid ISBN number", filePath);
            info.Isbn = string.Empty;
        }

        info.UserRating = GetFloatFromText(MaybeGetMetadata(metadata, "UserRating")) ?? 0.0f;
        info.TitleSort = MaybeGetMetadata(metadata, "TitleSort") ?? string.Empty;
        info.Series = MaybeGetMetadata(metadata, "Series") ?? info.TitleSort;
        info.SeriesSort = info.Series;

        // Leave Volume empty when the PDF supplies no usable value. Formatting a fallback 0 here
        // made the IsNullOrEmpty check below unreachable and invented a volume for every such file.
        info.Volume = GetFloatFromText(MaybeGetMetadata(metadata, "Volume"))?.ToString(Invariant) ?? string.Empty;

        // If this is a single book and not a collection, set publication status to Completed
        if (string.IsNullOrEmpty(info.Volume) && Parser.ParseVolume(filePath, LibraryType.Manga).Equals(Parser.LooseLeafVolume))
        {
            info.Count = 1;
        }

        // Deriving Series/Volume from the title (the light-novel heuristic) was intentionally
        // left out as probably unneeded, per the discussion in
        // https://github.com/Kareadita/Kavita/pull/3108#discussion_r1956747782

        ComicInfo.CleanComicInfo(info);

        return info;
    }

    /// <summary>
    /// Reads the metadata of the PDF at <paramref name="filePath"/> and converts it to a
    /// <see cref="ComicInfo"/>.
    /// </summary>
    /// <param name="filePath">Path of the PDF file.</param>
    /// <returns>
    /// The extracted information, or <c>null</c> when reading or converting the metadata threw;
    /// in that case the failure is logged as a warning and reported as a media issue.
    /// </returns>
    public ComicInfo? GetComicInfo(string filePath)
    {
        try
        {
            var extractor = new PdfMetadataExtractor(_logger, filePath);

            return GetComicInfoFromMetadata(extractor.GetMetadata(), filePath);
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "[GetComicInfo] There was an exception parsing PDF metadata for {File}", filePath);
            _mediaErrorService.ReportMediaIssue(filePath, MediaErrorProducer.BookService,
                "There was an exception parsing PDF metadata", ex);
        }

        return null;
    }
}
|
||||
1660
API/Helpers/PdfMetadataExtractor.cs
Normal file
1660
API/Helpers/PdfMetadataExtractor.cs
Normal file
File diff suppressed because it is too large
Load diff
|
|
@ -6,12 +6,14 @@ using System.Linq;
|
|||
using System.Text;
|
||||
using System.Text.RegularExpressions;
|
||||
using System.Threading.Tasks;
|
||||
using System.Xml;
|
||||
using API.Data.Metadata;
|
||||
using API.DTOs.Reader;
|
||||
using API.Entities;
|
||||
using API.Entities.Enums;
|
||||
using API.Extensions;
|
||||
using API.Services.Tasks.Scanner.Parser;
|
||||
using API.Helpers;
|
||||
using Docnet.Core;
|
||||
using Docnet.Core.Converters;
|
||||
using Docnet.Core.Models;
|
||||
|
|
@ -69,6 +71,8 @@ public class BookService : IBookService
|
|||
private static readonly RecyclableMemoryStreamManager StreamManager = new ();
|
||||
private const string CssScopeClass = ".book-content";
|
||||
private const string BookApiUrl = "book-resources?file=";
|
||||
private readonly PdfComicInfoExtractor _pdfComicInfoExtractor;
|
||||
|
||||
public static readonly EpubReaderOptions BookReaderOptions = new()
|
||||
{
|
||||
PackageReaderOptions = new PackageReaderOptions
|
||||
|
|
@ -84,6 +88,7 @@ public class BookService : IBookService
|
|||
_directoryService = directoryService;
|
||||
_imageService = imageService;
|
||||
_mediaErrorService = mediaErrorService;
|
||||
_pdfComicInfoExtractor = new PdfComicInfoExtractor(_logger, _mediaErrorService);
|
||||
}
|
||||
|
||||
private static bool HasClickableHrefPart(HtmlNode anchor)
|
||||
|
|
@ -425,10 +430,8 @@ public class BookService : IBookService
|
|||
}
|
||||
}
|
||||
|
||||
public ComicInfo? GetComicInfo(string filePath)
|
||||
private ComicInfo? GetEpubComicInfo(string filePath)
|
||||
{
|
||||
if (!IsValidFile(filePath) || Parser.IsPdf(filePath)) return null;
|
||||
|
||||
try
|
||||
{
|
||||
using var epubBook = EpubReader.OpenBook(filePath, BookReaderOptions);
|
||||
|
|
@ -442,7 +445,7 @@ public class BookService : IBookService
|
|||
var (year, month, day) = GetPublicationDate(publicationDate);
|
||||
|
||||
var summary = epubBook.Schema.Package.Metadata.Descriptions.FirstOrDefault();
|
||||
var info = new ComicInfo
|
||||
var info = new ComicInfo
|
||||
{
|
||||
Summary = string.IsNullOrEmpty(summary?.Description) ? string.Empty : summary.Description,
|
||||
Publisher = string.Join(",", epubBook.Schema.Package.Metadata.Publishers.Select(p => p.Publisher)),
|
||||
|
|
@ -583,6 +586,20 @@ public class BookService : IBookService
|
|||
return null;
|
||||
}
|
||||
|
||||
/// <summary>
/// Returns the <see cref="ComicInfo"/> for a book file, delegating PDFs to the PDF extractor
/// and every other valid file to the epub reader.
/// </summary>
/// <param name="filePath">Path of the file to inspect.</param>
/// <returns>The extracted information, or <c>null</c> when the file fails validation or the delegate returns none.</returns>
public ComicInfo? GetComicInfo(string filePath)
{
    // Nothing to extract from a file that does not pass validation.
    if (!IsValidFile(filePath)) return null;

    return Parser.IsPdf(filePath)
        ? _pdfComicInfoExtractor.GetComicInfo(filePath)
        : GetEpubComicInfo(filePath);
}
|
||||
|
||||
private static void ExtractSortTitle(EpubMetadataMeta metadataItem, EpubBookRef epubBook, ComicInfo info)
|
||||
{
|
||||
var titleId = metadataItem.Refines?.Replace("#", string.Empty);
|
||||
|
|
@ -685,7 +702,7 @@ public class BookService : IBookService
|
|||
return (year, month, day);
|
||||
}
|
||||
|
||||
private static string ValidateLanguage(string? language)
|
||||
public static string ValidateLanguage(string? language)
|
||||
{
|
||||
if (string.IsNullOrEmpty(language)) return string.Empty;
|
||||
|
||||
|
|
|
|||
|
|
@ -566,7 +566,6 @@ public class ExternalMetadataService : IExternalMetadataService
|
|||
return false;
|
||||
}
|
||||
|
||||
var relatedSeriesDict = new Dictionary<int, Series>();
|
||||
foreach (var relation in externalMetadataRelations)
|
||||
{
|
||||
var names = new [] {relation.SeriesName.PreferredTitle, relation.SeriesName.RomajiTitle, relation.SeriesName.EnglishTitle, relation.SeriesName.NativeTitle};
|
||||
|
|
@ -586,19 +585,6 @@ public class ExternalMetadataService : IExternalMetadataService
|
|||
|
||||
if (relationshipExists) continue;
|
||||
|
||||
relatedSeriesDict[relatedSeries.Id] = relatedSeries;
|
||||
}
|
||||
|
||||
// Process relationships
|
||||
foreach (var relation in externalMetadataRelations)
|
||||
{
|
||||
var relatedSeries = relatedSeriesDict.GetValueOrDefault(
|
||||
relatedSeriesDict.Keys.FirstOrDefault(k =>
|
||||
relatedSeriesDict[k].Name == relation.SeriesName.PreferredTitle ||
|
||||
relatedSeriesDict[k].Name == relation.SeriesName.NativeTitle));
|
||||
|
||||
if (relatedSeries == null) continue;
|
||||
|
||||
// Add new relationship
|
||||
var newRelation = new SeriesRelation
|
||||
{
|
||||
|
|
@ -969,7 +955,7 @@ public class ExternalMetadataService : IExternalMetadataService
|
|||
return false;
|
||||
}
|
||||
|
||||
if (!string.IsNullOrEmpty(externalMetadata.CoverUrl) && !settings.HasOverride(MetadataSettingField.Covers))
|
||||
if (string.IsNullOrEmpty(externalMetadata.CoverUrl))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -52,7 +52,7 @@ public class ReadingItemService : IReadingItemService
|
|||
/// <returns></returns>
|
||||
private ComicInfo? GetComicInfo(string filePath)
|
||||
{
|
||||
if (Parser.IsEpub(filePath))
|
||||
if (Parser.IsEpub(filePath) || Parser.IsPdf(filePath))
|
||||
{
|
||||
return _bookService.GetComicInfo(filePath);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -68,6 +68,9 @@ public class PdfParser(IDirectoryService directoryService) : DefaultParser(direc
|
|||
ParseFromFallbackFolders(filePath, tempRootPath, type, ref ret);
|
||||
}
|
||||
|
||||
// Patch in other information from ComicInfo
|
||||
UpdateFromComicInfo(ret);
|
||||
|
||||
if (ret.Chapters == Parser.DefaultChapter && ret.Volumes == Parser.LooseLeafVolume && type == LibraryType.Book)
|
||||
{
|
||||
ret.IsSpecial = true;
|
||||
|
|
|
|||
|
|
@ -285,7 +285,7 @@ public class ProcessSeries : IProcessSeries
|
|||
var firstChapter = SeriesService.GetFirstChapterForMetadata(series);
|
||||
|
||||
var firstFile = firstChapter?.Files.FirstOrDefault();
|
||||
if (firstFile == null || Parser.Parser.IsPdf(firstFile.FilePath)) return;
|
||||
if (firstFile == null) return;
|
||||
|
||||
var chapters = series.Volumes
|
||||
.SelectMany(volume => volume.Chapters)
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@
|
|||
"TokenKey": "super secret unguessable key that is longer because we require it",
|
||||
"Port": 5000,
|
||||
"IpAddresses": "0.0.0.0,::",
|
||||
"BaseUrl": "/test/",
|
||||
"BaseUrl": "/",
|
||||
"Cache": 75,
|
||||
"AllowIFraming": false
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue