Hooked in character counts per page for estimation, needs some cleanup.

This commit is contained in:
Joseph Milazzo 2025-07-08 06:06:35 -05:00
parent 9b7eb11359
commit ab6669703d
8 changed files with 105 additions and 27 deletions

View file

@ -14,6 +14,7 @@ using API.Entities.Enums;
using API.Extensions;
using API.Services.Tasks.Scanner.Parser;
using API.Helpers;
using API.Services.Tasks.Metadata;
using Docnet.Core;
using Docnet.Core.Converters;
using Docnet.Core.Models;
@ -59,6 +60,7 @@ public interface IBookService
Task<ICollection<BookChapterItem>> GenerateTableOfContents(Chapter chapter);
Task<string> GetBookPage(int page, int chapterId, string cachedEpubPath, string baseUrl, List<PersonalToCDto> ptocBookmarks, List<AnnotationDto> annotations);
Task<Dictionary<string, int>> CreateKeyToPageMappingAsync(EpubBookRef book);
/// <summary>
/// Opens the epub at <paramref name="bookFilePath"/> and builds a map of page number to the number of
/// letter characters (as defined by char.IsLetter) found in that page's text nodes. Text whose direct parent
/// is a script tag is not counted. Despite the name, the values are letter counts, not word counts.
/// </summary>
/// <param name="bookFilePath">Path to the epub file</param>
/// <returns>Page number to letter count. The BookService implementation returns null if any exception occurs while reading the book</returns>
Task<IDictionary<int, int>?> GetWordCountsPerPage(string bookFilePath);
}
public class BookService : IBookService
@ -955,6 +957,50 @@ public class BookService : IBookService
return dict;
}
/// <summary>
/// Opens the epub at <paramref name="bookFilePath"/> and, for every content file in reading order, counts the
/// letter characters contained in the text of its body.
/// </summary>
/// <remarks>
/// Despite the name, each value is the number of characters for which char.IsLetter is true, not a number of words.
/// Text whose direct parent is a script tag is ignored. Page numbers come from <see cref="CreateKeyToPageMappingAsync"/>.
/// </remarks>
/// <param name="bookFilePath">Path to the epub file</param>
/// <returns>Page number to letter count, or null if any exception occurs while opening or reading the book</returns>
public async Task<IDictionary<int, int>?> GetWordCountsPerPage(string bookFilePath)
{
    var ret = new Dictionary<int, int>();
    try
    {
        using var book = await EpubReader.OpenBookAsync(bookFilePath, LenientBookReaderOptions);
        var mappings = await CreateKeyToPageMappingAsync(book);

        // A single document instance is reused; LoadHtml replaces its content for each page
        var doc = new HtmlDocument {OptionFixNestedTags = true};

        var bookPages = await book.GetReadingOrderAsync();
        foreach (var contentFileRef in bookPages)
        {
            var page = mappings[contentFileRef.Key];
            var content = await contentFileRef.ReadContentAsync();
            doc.LoadHtml(content);

            var textRoot = doc.DocumentNode.SelectSingleNode("//body");
            if (textRoot == null)
            {
                // Malformed page: there may not even be an <html> element to attach a body to,
                // so count from the document root rather than risk a null dereference
                _logger.LogError("{FilePath} has no body tag! Counting text from the whole document instead. Book may be skewed", book.FilePath);
                textRoot = doc.DocumentNode;
            }

            // Find all text in the html body
            // The leading "." keeps the query relative to textRoot; a bare "//" is evaluated from the
            // document root and would also pick up text from <head> (title, style, etc)
            // TEMP: Refactor this to use WordCountAnalyzerService
            var textNodes = textRoot.SelectNodes(".//text()[not(parent::script)]");

            // SelectNodes returns null (not an empty collection) when nothing matches
            ret.Add(page, textNodes?.Sum(node => node.InnerText.Count(char.IsLetter)) ?? 0);
        }
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "There was an issue calculating word counts per page");
        return null;
    }

    return ret;
}
/// <summary>
/// Parses out Title from book. Chapters and Volumes will always be "0". If there is any exception reading book (malformed books)
/// then null is returned. This expects only an epub file