Hooked in character counts per page for estimation, needs some cleanup.

This commit is contained in:
Joseph Milazzo 2025-07-08 06:06:35 -05:00
parent 9b7eb11359
commit ab6669703d
8 changed files with 105 additions and 27 deletions

View file

@ -2,6 +2,7 @@
using System.IO; using System.IO;
using System.Linq; using System.Linq;
using System.Threading.Tasks; using System.Threading.Tasks;
using API.Constants;
using API.Data; using API.Data;
using API.DTOs.Reader; using API.DTOs.Reader;
using API.Entities.Enums; using API.Entities.Enums;
@ -40,11 +41,14 @@ public class BookController : BaseApiController
/// <param name="chapterId"></param> /// <param name="chapterId"></param>
/// <returns></returns> /// <returns></returns>
[HttpGet("{chapterId}/book-info")] [HttpGet("{chapterId}/book-info")]
public async Task<ActionResult<BookInfoDto>> GetBookInfo(int chapterId) [ResponseCache(CacheProfileName = ResponseCacheProfiles.Hour, VaryByQueryKeys = ["chapterId", "includeWordCounts"])]
public async Task<ActionResult<BookInfoDto>> GetBookInfo(int chapterId, bool includeWordCounts = false)
{ {
var dto = await _unitOfWork.ChapterRepository.GetChapterInfoDtoAsync(chapterId); var dto = await _unitOfWork.ChapterRepository.GetChapterInfoDtoAsync(chapterId);
if (dto == null) return BadRequest(await _localizationService.Translate(User.GetUserId(), "chapter-doesnt-exist")); if (dto == null) return BadRequest(await _localizationService.Translate(User.GetUserId(), "chapter-doesnt-exist"));
var bookTitle = string.Empty; var bookTitle = string.Empty;
IDictionary<int, int>? pageWordCounts = null;
switch (dto.SeriesFormat) switch (dto.SeriesFormat)
{ {
case MangaFormat.Epub: case MangaFormat.Epub:
@ -52,6 +56,12 @@ public class BookController : BaseApiController
var mangaFile = (await _unitOfWork.ChapterRepository.GetFilesForChapterAsync(chapterId))[0]; var mangaFile = (await _unitOfWork.ChapterRepository.GetFilesForChapterAsync(chapterId))[0];
using var book = await EpubReader.OpenBookAsync(mangaFile.FilePath, BookService.LenientBookReaderOptions); using var book = await EpubReader.OpenBookAsync(mangaFile.FilePath, BookService.LenientBookReaderOptions);
bookTitle = book.Title; bookTitle = book.Title;
if (includeWordCounts)
{
// TODO: Cache this in temp/chapterId folder to avoid having to process file each time
pageWordCounts = await _bookService.GetWordCountsPerPage(mangaFile.FilePath);
}
break; break;
} }
case MangaFormat.Pdf: case MangaFormat.Pdf:
@ -72,7 +82,7 @@ public class BookController : BaseApiController
break; break;
} }
return Ok(new BookInfoDto() var info = new BookInfoDto()
{ {
ChapterNumber = dto.ChapterNumber, ChapterNumber = dto.ChapterNumber,
VolumeNumber = dto.VolumeNumber, VolumeNumber = dto.VolumeNumber,
@ -84,7 +94,14 @@ public class BookController : BaseApiController
LibraryId = dto.LibraryId, LibraryId = dto.LibraryId,
IsSpecial = dto.IsSpecial, IsSpecial = dto.IsSpecial,
Pages = dto.Pages, Pages = dto.Pages,
}); PageWordCounts = pageWordCounts
};
return Ok(info);
} }
/// <summary> /// <summary>

View file

@ -41,6 +41,7 @@ public class ReaderController : BaseApiController
private readonly IEventHub _eventHub; private readonly IEventHub _eventHub;
private readonly IScrobblingService _scrobblingService; private readonly IScrobblingService _scrobblingService;
private readonly ILocalizationService _localizationService; private readonly ILocalizationService _localizationService;
private readonly IBookService _bookService;
/// <inheritdoc /> /// <inheritdoc />
public ReaderController(ICacheService cacheService, public ReaderController(ICacheService cacheService,
@ -48,7 +49,8 @@ public class ReaderController : BaseApiController
IReaderService readerService, IBookmarkService bookmarkService, IReaderService readerService, IBookmarkService bookmarkService,
IAccountService accountService, IEventHub eventHub, IAccountService accountService, IEventHub eventHub,
IScrobblingService scrobblingService, IScrobblingService scrobblingService,
ILocalizationService localizationService) ILocalizationService localizationService,
IBookService bookService)
{ {
_cacheService = cacheService; _cacheService = cacheService;
_unitOfWork = unitOfWork; _unitOfWork = unitOfWork;
@ -59,6 +61,7 @@ public class ReaderController : BaseApiController
_eventHub = eventHub; _eventHub = eventHub;
_scrobblingService = scrobblingService; _scrobblingService = scrobblingService;
_localizationService = localizationService; _localizationService = localizationService;
_bookService = bookService;
} }
/// <summary> /// <summary>
@ -218,11 +221,11 @@ public class ReaderController : BaseApiController
/// <remarks>This is generally the first call when attempting to read to allow pre-generation of assets needed for reading</remarks> /// <remarks>This is generally the first call when attempting to read to allow pre-generation of assets needed for reading</remarks>
/// <param name="chapterId"></param> /// <param name="chapterId"></param>
/// <param name="extractPdf">Should Kavita extract pdf into images. Defaults to false.</param> /// <param name="extractPdf">Should Kavita extract pdf into images. Defaults to false.</param>
/// <param name="includeDimensions">Include file dimensions. Only useful for image based reading</param> /// <param name="includeDimensions">Include file dimensions. Only useful for image-based reading</param>
/// <param name="includeWordCounts">Include epub word counts per page. Only useful for epub-based reading</param>
/// <returns></returns> /// <returns></returns>
[HttpGet("chapter-info")] [HttpGet("chapter-info")]
[ResponseCache(CacheProfileName = ResponseCacheProfiles.Hour, VaryByQueryKeys = ["chapterId", "extractPdf", "includeDimensions" [ResponseCache(CacheProfileName = ResponseCacheProfiles.Hour, VaryByQueryKeys = ["chapterId", "extractPdf", "includeDimensions"])]
])]
public async Task<ActionResult<ChapterInfoDto>> GetChapterInfo(int chapterId, bool extractPdf = false, bool includeDimensions = false) public async Task<ActionResult<ChapterInfoDto>> GetChapterInfo(int chapterId, bool extractPdf = false, bool includeDimensions = false)
{ {
if (chapterId <= 0) return Ok(null); // This can happen occasionally from UI, we should just ignore if (chapterId <= 0) return Ok(null); // This can happen occasionally from UI, we should just ignore
@ -846,6 +849,7 @@ public class ReaderController : BaseApiController
// Patch in the reading progress // Patch in the reading progress
await _unitOfWork.ChapterRepository.AddChapterModifiers(User.GetUserId(), chapter); await _unitOfWork.ChapterRepository.AddChapterModifiers(User.GetUserId(), chapter);
// TODO: We need to actually use word count from the pages
if (series.Format == MangaFormat.Epub) if (series.Format == MangaFormat.Epub)
{ {
var progressCount = chapter.WordCount; var progressCount = chapter.WordCount;

View file

@ -1,4 +1,5 @@
using API.Entities.Enums; using System.Collections.Generic;
using API.Entities.Enums;
namespace API.DTOs.Reader; namespace API.DTOs.Reader;
@ -15,4 +16,9 @@ public sealed record BookInfoDto : IChapterInfoDto
public int Pages { get; set; } public int Pages { get; set; }
public bool IsSpecial { get; set; } public bool IsSpecial { get; set; }
public string ChapterTitle { get; set; } = default! ; public string ChapterTitle { get; set; } = default! ;
/// <summary>
/// For Epub reader, this will contain Page number -> count of letter characters on that page
/// (a character count, not a true word count, despite the property name). All other times will be null.
/// </summary>
/// <remarks>This is optionally returned by includeWordCounts. It is also null when the counts could not be calculated.</remarks>
public IDictionary<int, int>? PageWordCounts { get; set; }
} }

View file

@ -14,6 +14,7 @@ using API.Entities.Enums;
using API.Extensions; using API.Extensions;
using API.Services.Tasks.Scanner.Parser; using API.Services.Tasks.Scanner.Parser;
using API.Helpers; using API.Helpers;
using API.Services.Tasks.Metadata;
using Docnet.Core; using Docnet.Core;
using Docnet.Core.Converters; using Docnet.Core.Converters;
using Docnet.Core.Models; using Docnet.Core.Models;
@ -59,6 +60,7 @@ public interface IBookService
Task<ICollection<BookChapterItem>> GenerateTableOfContents(Chapter chapter); Task<ICollection<BookChapterItem>> GenerateTableOfContents(Chapter chapter);
Task<string> GetBookPage(int page, int chapterId, string cachedEpubPath, string baseUrl, List<PersonalToCDto> ptocBookmarks, List<AnnotationDto> annotations); Task<string> GetBookPage(int page, int chapterId, string cachedEpubPath, string baseUrl, List<PersonalToCDto> ptocBookmarks, List<AnnotationDto> annotations);
Task<Dictionary<string, int>> CreateKeyToPageMappingAsync(EpubBookRef book); Task<Dictionary<string, int>> CreateKeyToPageMappingAsync(EpubBookRef book);
Task<IDictionary<int, int>?> GetWordCountsPerPage(string bookFilePath);
} }
public class BookService : IBookService public class BookService : IBookService
@ -955,6 +957,50 @@ public class BookService : IBookService
return dict; return dict;
} }
/// <summary>
/// Calculates, for each page in the epub's reading order, how many letter characters its readable text contains.
/// </summary>
/// <remarks>
/// Despite the name, each value is a count of letters (char.IsLetter), not a count of words.
/// Text inside script and style elements is ignored. A content file that has no page mapping is skipped
/// rather than failing the whole book.
/// </remarks>
/// <param name="bookFilePath">Path of the epub file to open</param>
/// <returns>Page number -> letter count, or null when the book could not be opened or processed</returns>
public async Task<IDictionary<int, int>?> GetWordCountsPerPage(string bookFilePath)
{
    var ret = new Dictionary<int, int>();
    try
    {
        using var book = await EpubReader.OpenBookAsync(bookFilePath, LenientBookReaderOptions);
        var mappings = await CreateKeyToPageMappingAsync(book);

        // One document instance is reused; LoadHtml replaces its content on every iteration
        var doc = new HtmlDocument {OptionFixNestedTags = true};

        var bookPages = await book.GetReadingOrderAsync();
        foreach (var contentFileRef in bookPages)
        {
            // Do not let a single unmapped content file throw and discard the counts of every other page
            if (!mappings.TryGetValue(contentFileRef.Key, out var page))
            {
                _logger.LogWarning("{FilePath} has no page mapping for {Key}, skipping it for per-page counts", book.FilePath, contentFileRef.Key);
                continue;
            }

            var content = await contentFileRef.ReadContentAsync();
            doc.LoadHtml(content);

            // Prefer the body; when there is none, count over the whole document instead of
            // appending an empty body (which can never contain text and needs an html node to exist)
            var scope = doc.DocumentNode.SelectSingleNode("//body");
            if (scope == null)
            {
                _logger.LogError("{FilePath} has no body tag! Counting text across the whole document. Counts may be skewed", book.FilePath);
                scope = doc.DocumentNode;
            }

            // TODO: Refactor this to use WordCountAnalyzerService
            // The leading "." keeps the query relative to scope; a bare "//" restarts at the document root
            var textNodes = scope.SelectNodes(".//text()[not(parent::script) and not(parent::style)]");

            // Indexer assignment so a repeated page number cannot throw like Dictionary.Add would
            ret[page] = textNodes?.Sum(node => node.InnerText.Count(char.IsLetter)) ?? 0;
        }
    }
    catch (Exception ex)
    {
        // Best effort: the caller treats null as "no counts available"
        _logger.LogError(ex, "There was an issue calculating word counts per page");
        return null;
    }

    return ret;
}
/// <summary> /// <summary>
/// Parses out Title from book. Chapters and Volumes will always be "0". If there is any exception reading book (malformed books) /// Parses out Title from book. Chapters and Volumes will always be "0". If there is any exception reading book (malformed books)
/// then null is returned. This expects only an epub file /// then null is returned. This expects only an epub file

View file

@ -35,7 +35,7 @@ public class WordCountAnalyzerService : IWordCountAnalyzerService
private readonly IReaderService _readerService; private readonly IReaderService _readerService;
private readonly IMediaErrorService _mediaErrorService; private readonly IMediaErrorService _mediaErrorService;
private const int AverageCharactersPerWord = 5; public const int AverageCharactersPerWord = 5;
public WordCountAnalyzerService(ILogger<WordCountAnalyzerService> logger, IUnitOfWork unitOfWork, IEventHub eventHub, public WordCountAnalyzerService(ILogger<WordCountAnalyzerService> logger, IUnitOfWork unitOfWork, IEventHub eventHub,
ICacheHelper cacheHelper, IReaderService readerService, IMediaErrorService mediaErrorService) ICacheHelper cacheHelper, IReaderService readerService, IMediaErrorService mediaErrorService)

View file

@ -662,7 +662,7 @@ export class BookReaderComponent implements OnInit, AfterViewInit, OnDestroy {
this.cdRef.markForCheck(); this.cdRef.markForCheck();
this.bookService.getBookInfo(this.chapterId).subscribe(async (info) => { this.bookService.getBookInfo(this.chapterId, true).subscribe(async (info) => {
if (this.readingListMode && info.seriesFormat !== MangaFormat.EPUB) { if (this.readingListMode && info.seriesFormat !== MangaFormat.EPUB) {
// Redirect to the manga reader. // Redirect to the manga reader.
const params = this.readerService.getQueryParamsObject(this.incognitoMode, this.readingListMode, this.readingListId); const params = this.readerService.getQueryParamsObject(this.incognitoMode, this.readingListMode, this.readingListId);

View file

@ -1,4 +1,4 @@
import { MangaFormat } from "src/app/_models/manga-format"; import {MangaFormat} from "src/app/_models/manga-format";
export interface BookInfo { export interface BookInfo {
bookTitle: string; bookTitle: string;
@ -6,4 +6,8 @@ export interface BookInfo {
seriesId: number; seriesId: number;
libraryId: number; libraryId: number;
volumeId: number; volumeId: number;
/**
* Maps the page number to character count. Only available on epub reader.
*/
pageWordCounts: {[key: number]: number};
} }

View file

@ -1,9 +1,9 @@
import { HttpClient } from '@angular/common/http'; import {HttpClient} from '@angular/common/http';
import { Injectable } from '@angular/core'; import {Injectable} from '@angular/core';
import { TextResonse } from 'src/app/_types/text-response'; import {TextResonse} from 'src/app/_types/text-response';
import { environment } from 'src/environments/environment'; import {environment} from 'src/environments/environment';
import { BookChapterItem } from '../_models/book-chapter-item'; import {BookChapterItem} from '../_models/book-chapter-item';
import { BookInfo } from '../_models/book-info'; import {BookInfo} from '../_models/book-info';
export interface FontFamily { export interface FontFamily {
/** /**
@ -28,7 +28,8 @@ export class BookService {
getFontFamilies(): Array<FontFamily> { getFontFamilies(): Array<FontFamily> {
return [{title: 'default', family: 'default'}, {title: 'EBGaramond', family: 'EBGaramond'}, {title: 'Fira Sans', family: 'Fira_Sans'}, return [{title: 'default', family: 'default'}, {title: 'EBGaramond', family: 'EBGaramond'}, {title: 'Fira Sans', family: 'Fira_Sans'},
{title: 'Lato', family: 'Lato'}, {title: 'Libre Baskerville', family: 'Libre_Baskerville'}, {title: 'Merriweather', family: 'Merriweather'}, {title: 'Lato', family: 'Lato'}, {title: 'Libre Baskerville', family: 'Libre_Baskerville'}, {title: 'Merriweather', family: 'Merriweather'},
{title: 'Nanum Gothic', family: 'Nanum_Gothic'}, {title: 'Open Dyslexic', family: 'OpenDyslexic2'}, {title: 'RocknRoll One', family: 'RocknRoll_One'}, {title: 'Fast Font Serif (Bionic)', family: 'FastFontSerif'}, {title: 'Fast Font Sans (Bionic)', family: 'FastFontSans'}]; {title: 'Nanum Gothic', family: 'Nanum_Gothic'}, {title: 'Open Dyslexic', family: 'OpenDyslexic2'}, {title: 'RocknRoll One', family: 'RocknRoll_One'},
{title: 'Fast Font Serif (Bionic)', family: 'FastFontSerif'}, {title: 'Fast Font Sans (Bionic)', family: 'FastFontSans'}];
} }
getBookChapters(chapterId: number) { getBookChapters(chapterId: number) {
@ -39,8 +40,8 @@ export class BookService {
return this.http.get<string>(this.baseUrl + 'book/' + chapterId + '/book-page?page=' + page, TextResonse); return this.http.get<string>(this.baseUrl + 'book/' + chapterId + '/book-page?page=' + page, TextResonse);
} }
getBookInfo(chapterId: number) { getBookInfo(chapterId: number, includeWordCounts: boolean = false) {
return this.http.get<BookInfo>(this.baseUrl + 'book/' + chapterId + '/book-info'); return this.http.get<BookInfo>(this.baseUrl + `book/${chapterId}/book-info?includeWordCounts=${includeWordCounts}`);
} }
getBookPageUrl(chapterId: number, page: number) { getBookPageUrl(chapterId: number, page: number) {