diff --git a/API/Controllers/BookController.cs b/API/Controllers/BookController.cs index 453a4ef65..e727d8c1a 100644 --- a/API/Controllers/BookController.cs +++ b/API/Controllers/BookController.cs @@ -2,6 +2,7 @@ using System.IO; using System.Linq; using System.Threading.Tasks; +using API.Constants; using API.Data; using API.DTOs.Reader; using API.Entities.Enums; @@ -40,11 +41,14 @@ public class BookController : BaseApiController /// /// [HttpGet("{chapterId}/book-info")] - public async Task> GetBookInfo(int chapterId) + [ResponseCache(CacheProfileName = ResponseCacheProfiles.Hour, VaryByQueryKeys = ["chapterId", "includeWordCounts"])] + public async Task> GetBookInfo(int chapterId, bool includeWordCounts = false) { var dto = await _unitOfWork.ChapterRepository.GetChapterInfoDtoAsync(chapterId); if (dto == null) return BadRequest(await _localizationService.Translate(User.GetUserId(), "chapter-doesnt-exist")); var bookTitle = string.Empty; + IDictionary? pageWordCounts = null; + switch (dto.SeriesFormat) { case MangaFormat.Epub: @@ -52,6 +56,12 @@ public class BookController : BaseApiController var mangaFile = (await _unitOfWork.ChapterRepository.GetFilesForChapterAsync(chapterId))[0]; using var book = await EpubReader.OpenBookAsync(mangaFile.FilePath, BookService.LenientBookReaderOptions); bookTitle = book.Title; + + if (includeWordCounts) + { + // TODO: Cache this in temp/chapterId folder to avoid having to process file each time + pageWordCounts = await _bookService.GetWordCountsPerPage(mangaFile.FilePath); + } break; } case MangaFormat.Pdf: @@ -72,9 +82,9 @@ public class BookController : BaseApiController break; } - return Ok(new BookInfoDto() + var info = new BookInfoDto() { - ChapterNumber = dto.ChapterNumber, + ChapterNumber = dto.ChapterNumber, VolumeNumber = dto.VolumeNumber, VolumeId = dto.VolumeId, BookTitle = bookTitle, @@ -84,7 +94,14 @@ public class BookController : BaseApiController LibraryId = dto.LibraryId, IsSpecial = dto.IsSpecial, Pages = dto.Pages, - }); + PageWordCounts = pageWordCounts + }; + + + + + + return Ok(info); } /// diff --git a/API/Controllers/ReaderController.cs b/API/Controllers/ReaderController.cs index 3cc9f5285..1071ba46c 100644 --- a/API/Controllers/ReaderController.cs +++ b/API/Controllers/ReaderController.cs @@ -41,6 +41,7 @@ public class ReaderController : BaseApiController private readonly IEventHub _eventHub; private readonly IScrobblingService _scrobblingService; private readonly ILocalizationService _localizationService; + private readonly IBookService _bookService; /// public ReaderController(ICacheService cacheService, @@ -48,7 +49,8 @@ public class ReaderController : BaseApiController IReaderService readerService, IBookmarkService bookmarkService, IAccountService accountService, IEventHub eventHub, IScrobblingService scrobblingService, - ILocalizationService localizationService) + ILocalizationService localizationService, + IBookService bookService) { _cacheService = cacheService; _unitOfWork = unitOfWork; @@ -59,6 +61,7 @@ public class ReaderController : BaseApiController _eventHub = eventHub; _scrobblingService = scrobblingService; _localizationService = localizationService; + _bookService = bookService; } /// @@ -218,11 +221,11 @@ public class ReaderController : BaseApiController /// This is generally the first call when attempting to read to allow pre-generation of assets needed for reading /// /// Should Kavita extract pdf into images. Defaults to false. - /// Include file dimensions. Only useful for image based reading + /// Include file dimensions. Only useful for image-based reading + /// Include epub word counts per page. Only useful for epub-based reading /// [HttpGet("chapter-info")] - [ResponseCache(CacheProfileName = ResponseCacheProfiles.Hour, VaryByQueryKeys = ["chapterId", "extractPdf", "includeDimensions" - ])] + [ResponseCache(CacheProfileName = ResponseCacheProfiles.Hour, VaryByQueryKeys = ["chapterId", "extractPdf", "includeDimensions"])] public async Task> GetChapterInfo(int chapterId, bool extractPdf = false, bool includeDimensions = false) { if (chapterId <= 0) return Ok(null); // This can happen occasionally from UI, we should just ignore @@ -846,6 +849,7 @@ public class ReaderController : BaseApiController // Patch in the reading progress await _unitOfWork.ChapterRepository.AddChapterModifiers(User.GetUserId(), chapter); + // TODO: We need to actually use word count from the pages if (series.Format == MangaFormat.Epub) { var progressCount = chapter.WordCount; diff --git a/API/DTOs/Reader/BookInfoDto.cs b/API/DTOs/Reader/BookInfoDto.cs index 2473cd5dc..7f56ca160 100644 --- a/API/DTOs/Reader/BookInfoDto.cs +++ b/API/DTOs/Reader/BookInfoDto.cs @@ -1,4 +1,5 @@ -using API.Entities.Enums; +using System.Collections.Generic; +using API.Entities.Enums; namespace API.DTOs.Reader; @@ -15,4 +16,9 @@ public sealed record BookInfoDto : IChapterInfoDto public int Pages { get; set; } public bool IsSpecial { get; set; } public string ChapterTitle { get; set; } = default! ; + /// + /// For Epub reader, this will contain Page number -> word count. All other times will be null. + /// + /// This is optionally returned by includeWordCounts + public IDictionary? PageWordCounts { get; set; } } diff --git a/API/Services/BookService.cs b/API/Services/BookService.cs index f4d07685b..f51532671 100644 --- a/API/Services/BookService.cs +++ b/API/Services/BookService.cs @@ -14,6 +14,7 @@ using API.Entities.Enums; using API.Extensions; using API.Services.Tasks.Scanner.Parser; using API.Helpers; +using API.Services.Tasks.Metadata; using Docnet.Core; using Docnet.Core.Converters; using Docnet.Core.Models; @@ -59,6 +60,7 @@ public interface IBookService Task> GenerateTableOfContents(Chapter chapter); Task GetBookPage(int page, int chapterId, string cachedEpubPath, string baseUrl, List ptocBookmarks, List annotations); Task> CreateKeyToPageMappingAsync(EpubBookRef book); + Task?> GetWordCountsPerPage(string bookFilePath); } public class BookService : IBookService @@ -955,6 +957,50 @@ public class BookService : IBookService return dict; } + public async Task?> GetWordCountsPerPage(string bookFilePath) + { + var ret = new Dictionary(); + try + { + using var book = await EpubReader.OpenBookAsync(bookFilePath, LenientBookReaderOptions); + var mappings = await CreateKeyToPageMappingAsync(book); + + var doc = new HtmlDocument {OptionFixNestedTags = true}; + + + var bookPages = await book.GetReadingOrderAsync(); + foreach (var contentFileRef in bookPages) + { + var page = mappings[contentFileRef.Key]; + var content = await contentFileRef.ReadContentAsync(); + doc.LoadHtml(content); + + var body = doc.DocumentNode.SelectSingleNode("//body"); + + if (body == null) + { + _logger.LogError("{FilePath} has no body tag! Generating one for support. Book may be skewed", book.FilePath); + doc.DocumentNode.SelectSingleNode("/html").AppendChild(HtmlNode.CreateNode("")); + body = doc.DocumentNode.SelectSingleNode("//html/body"); + } + + // Find all words in the html body + // TEMP: REfactor this to use WordCountAnalyzerService + var textNodes = body!.SelectNodes("//text()[not(parent::script)]"); + ret.Add(page, textNodes?.Sum(node => node.InnerText.Count(char.IsLetter)) ?? 0); + + } + + } + catch (Exception ex) + { + _logger.LogError(ex, "There was an issue calculating word counts per page"); + return null; + } + + return ret; + } + /// /// Parses out Title from book. Chapters and Volumes will always be "0". If there is any exception reading book (malformed books) /// then null is returned. This expects only an epub file diff --git a/API/Services/Tasks/Metadata/WordCountAnalyzerService.cs b/API/Services/Tasks/Metadata/WordCountAnalyzerService.cs index bff7001bd..a1c1e4e02 100644 --- a/API/Services/Tasks/Metadata/WordCountAnalyzerService.cs +++ b/API/Services/Tasks/Metadata/WordCountAnalyzerService.cs @@ -35,7 +35,7 @@ public class WordCountAnalyzerService : IWordCountAnalyzerService private readonly IReaderService _readerService; private readonly IMediaErrorService _mediaErrorService; - private const int AverageCharactersPerWord = 5; + public const int AverageCharactersPerWord = 5; public WordCountAnalyzerService(ILogger logger, IUnitOfWork unitOfWork, IEventHub eventHub, ICacheHelper cacheHelper, IReaderService readerService, IMediaErrorService mediaErrorService) diff --git a/UI/Web/src/app/book-reader/_components/book-reader/book-reader.component.ts b/UI/Web/src/app/book-reader/_components/book-reader/book-reader.component.ts index 057e9d848..c33768b82 100644 --- a/UI/Web/src/app/book-reader/_components/book-reader/book-reader.component.ts +++ b/UI/Web/src/app/book-reader/_components/book-reader/book-reader.component.ts @@ -662,7 +662,7 @@ export class BookReaderComponent implements OnInit, AfterViewInit, OnDestroy { this.cdRef.markForCheck(); - this.bookService.getBookInfo(this.chapterId).subscribe(async (info) => { + this.bookService.getBookInfo(this.chapterId, true).subscribe(async (info) => { if (this.readingListMode && info.seriesFormat !== MangaFormat.EPUB) { // Redirect to the manga reader. const params = this.readerService.getQueryParamsObject(this.incognitoMode, this.readingListMode, this.readingListId); diff --git a/UI/Web/src/app/book-reader/_models/book-info.ts b/UI/Web/src/app/book-reader/_models/book-info.ts index 4816bd324..e564bf84d 100644 --- a/UI/Web/src/app/book-reader/_models/book-info.ts +++ b/UI/Web/src/app/book-reader/_models/book-info.ts @@ -1,9 +1,13 @@ -import { MangaFormat } from "src/app/_models/manga-format"; +import {MangaFormat} from "src/app/_models/manga-format"; export interface BookInfo { - bookTitle: string; - seriesFormat: MangaFormat; - seriesId: number; - libraryId: number; - volumeId: number; -} \ No newline at end of file + bookTitle: string; + seriesFormat: MangaFormat; + seriesId: number; + libraryId: number; + volumeId: number; + /** + * Maps the page number to character count. Only available on epub reader. + */ + pageWordCounts: {[key: number]: number}; +} diff --git a/UI/Web/src/app/book-reader/_services/book.service.ts b/UI/Web/src/app/book-reader/_services/book.service.ts index d98f09f38..5dd433482 100644 --- a/UI/Web/src/app/book-reader/_services/book.service.ts +++ b/UI/Web/src/app/book-reader/_services/book.service.ts @@ -1,9 +1,9 @@ -import { HttpClient } from '@angular/common/http'; -import { Injectable } from '@angular/core'; -import { TextResonse } from 'src/app/_types/text-response'; -import { environment } from 'src/environments/environment'; -import { BookChapterItem } from '../_models/book-chapter-item'; -import { BookInfo } from '../_models/book-info'; +import {HttpClient} from '@angular/common/http'; +import {Injectable} from '@angular/core'; +import {TextResonse} from 'src/app/_types/text-response'; +import {environment} from 'src/environments/environment'; +import {BookChapterItem} from '../_models/book-chapter-item'; +import {BookInfo} from '../_models/book-info'; export interface FontFamily { /** @@ -28,7 +28,8 @@ export class BookService { getFontFamilies(): Array { return [{title: 'default', family: 'default'}, {title: 'EBGaramond', family: 'EBGaramond'}, {title: 'Fira Sans', family: 'Fira_Sans'}, {title: 'Lato', family: 'Lato'}, {title: 'Libre Baskerville', family: 'Libre_Baskerville'}, {title: 'Merriweather', family: 'Merriweather'}, - {title: 'Nanum Gothic', family: 'Nanum_Gothic'}, {title: 'Open Dyslexic', family: 'OpenDyslexic2'}, {title: 'RocknRoll One', family: 'RocknRoll_One'}, {title: 'Fast Font Serif (Bionic)', family: 'FastFontSerif'}, {title: 'Fast Font Sans (Bionic)', family: 'FastFontSans'}]; + {title: 'Nanum Gothic', family: 'Nanum_Gothic'}, {title: 'Open Dyslexic', family: 'OpenDyslexic2'}, {title: 'RocknRoll One', family: 'RocknRoll_One'}, + {title: 'Fast Font Serif (Bionic)', family: 'FastFontSerif'}, {title: 'Fast Font Sans (Bionic)', family: 'FastFontSans'}]; } getBookChapters(chapterId: number) { @@ -39,8 +40,8 @@ export class BookService { return this.http.get(this.baseUrl + 'book/' + chapterId + '/book-page?page=' + page, TextResonse); } - getBookInfo(chapterId: number) { - return this.http.get(this.baseUrl + 'book/' + chapterId + '/book-info'); + getBookInfo(chapterId: number, includeWordCounts: boolean = false) { + return this.http.get(this.baseUrl + `book/${chapterId}/book-info?includeWordCounts=${includeWordCounts}`); } getBookPageUrl(chapterId: number, page: number) {