diff --git a/API/Controllers/BookController.cs b/API/Controllers/BookController.cs
index 453a4ef65..e727d8c1a 100644
--- a/API/Controllers/BookController.cs
+++ b/API/Controllers/BookController.cs
@@ -2,6 +2,7 @@
using System.IO;
using System.Linq;
using System.Threading.Tasks;
+using API.Constants;
using API.Data;
using API.DTOs.Reader;
using API.Entities.Enums;
@@ -40,11 +41,14 @@ public class BookController : BaseApiController
///
///
[HttpGet("{chapterId}/book-info")]
- public async Task> GetBookInfo(int chapterId)
+ [ResponseCache(CacheProfileName = ResponseCacheProfiles.Hour, VaryByQueryKeys = ["chapterId", "includeWordCounts"])]
+ public async Task> GetBookInfo(int chapterId, bool includeWordCounts = false)
{
var dto = await _unitOfWork.ChapterRepository.GetChapterInfoDtoAsync(chapterId);
if (dto == null) return BadRequest(await _localizationService.Translate(User.GetUserId(), "chapter-doesnt-exist"));
var bookTitle = string.Empty;
+ IDictionary? pageWordCounts = null;
+
switch (dto.SeriesFormat)
{
case MangaFormat.Epub:
@@ -52,6 +56,12 @@ public class BookController : BaseApiController
var mangaFile = (await _unitOfWork.ChapterRepository.GetFilesForChapterAsync(chapterId))[0];
using var book = await EpubReader.OpenBookAsync(mangaFile.FilePath, BookService.LenientBookReaderOptions);
bookTitle = book.Title;
+
+ if (includeWordCounts)
+ {
+ // TODO: Cache this in temp/chapterId folder to avoid having to process file each time
+ pageWordCounts = await _bookService.GetWordCountsPerPage(mangaFile.FilePath);
+ }
break;
}
case MangaFormat.Pdf:
@@ -72,9 +82,9 @@ public class BookController : BaseApiController
break;
}
- return Ok(new BookInfoDto()
+ var info = new BookInfoDto()
{
- ChapterNumber = dto.ChapterNumber,
+ ChapterNumber = dto.ChapterNumber,
VolumeNumber = dto.VolumeNumber,
VolumeId = dto.VolumeId,
BookTitle = bookTitle,
@@ -84,7 +94,14 @@ public class BookController : BaseApiController
LibraryId = dto.LibraryId,
IsSpecial = dto.IsSpecial,
Pages = dto.Pages,
- });
+ PageWordCounts = pageWordCounts
+ };
+
+
+
+
+
+ return Ok(info);
}
///
diff --git a/API/Controllers/ReaderController.cs b/API/Controllers/ReaderController.cs
index 3cc9f5285..1071ba46c 100644
--- a/API/Controllers/ReaderController.cs
+++ b/API/Controllers/ReaderController.cs
@@ -41,6 +41,7 @@ public class ReaderController : BaseApiController
private readonly IEventHub _eventHub;
private readonly IScrobblingService _scrobblingService;
private readonly ILocalizationService _localizationService;
+ private readonly IBookService _bookService;
///
public ReaderController(ICacheService cacheService,
@@ -48,7 +49,8 @@ public class ReaderController : BaseApiController
IReaderService readerService, IBookmarkService bookmarkService,
IAccountService accountService, IEventHub eventHub,
IScrobblingService scrobblingService,
- ILocalizationService localizationService)
+ ILocalizationService localizationService,
+ IBookService bookService)
{
_cacheService = cacheService;
_unitOfWork = unitOfWork;
@@ -59,6 +61,7 @@ public class ReaderController : BaseApiController
_eventHub = eventHub;
_scrobblingService = scrobblingService;
_localizationService = localizationService;
+ _bookService = bookService;
}
///
@@ -218,11 +221,11 @@ public class ReaderController : BaseApiController
/// This is generally the first call when attempting to read to allow pre-generation of assets needed for reading
///
/// Should Kavita extract pdf into images. Defaults to false.
- /// Include file dimensions. Only useful for image based reading
+ /// Include file dimensions. Only useful for image-based reading
+ /// Include epub word counts per page. Only useful for epub-based reading
///
[HttpGet("chapter-info")]
- [ResponseCache(CacheProfileName = ResponseCacheProfiles.Hour, VaryByQueryKeys = ["chapterId", "extractPdf", "includeDimensions"
- ])]
+ [ResponseCache(CacheProfileName = ResponseCacheProfiles.Hour, VaryByQueryKeys = ["chapterId", "extractPdf", "includeDimensions"])]
public async Task> GetChapterInfo(int chapterId, bool extractPdf = false, bool includeDimensions = false)
{
if (chapterId <= 0) return Ok(null); // This can happen occasionally from UI, we should just ignore
@@ -846,6 +849,7 @@ public class ReaderController : BaseApiController
// Patch in the reading progress
await _unitOfWork.ChapterRepository.AddChapterModifiers(User.GetUserId(), chapter);
+ // TODO: We need to actually use word count from the pages
if (series.Format == MangaFormat.Epub)
{
var progressCount = chapter.WordCount;
diff --git a/API/DTOs/Reader/BookInfoDto.cs b/API/DTOs/Reader/BookInfoDto.cs
index 2473cd5dc..7f56ca160 100644
--- a/API/DTOs/Reader/BookInfoDto.cs
+++ b/API/DTOs/Reader/BookInfoDto.cs
@@ -1,4 +1,5 @@
-using API.Entities.Enums;
+using System.Collections.Generic;
+using API.Entities.Enums;
namespace API.DTOs.Reader;
@@ -15,4 +16,9 @@ public sealed record BookInfoDto : IChapterInfoDto
public int Pages { get; set; }
public bool IsSpecial { get; set; }
public string ChapterTitle { get; set; } = default! ;
+ ///
+ /// For Epub reader, this will contain Page number -> letter (character) count of that page's text. All other times will be null.
+ ///
+ /// This is optionally returned by includeWordCounts
+ public IDictionary? PageWordCounts { get; set; }
}
diff --git a/API/Services/BookService.cs b/API/Services/BookService.cs
index f4d07685b..f51532671 100644
--- a/API/Services/BookService.cs
+++ b/API/Services/BookService.cs
@@ -14,6 +14,7 @@ using API.Entities.Enums;
using API.Extensions;
using API.Services.Tasks.Scanner.Parser;
using API.Helpers;
+using API.Services.Tasks.Metadata;
using Docnet.Core;
using Docnet.Core.Converters;
using Docnet.Core.Models;
@@ -59,6 +60,7 @@ public interface IBookService
Task> GenerateTableOfContents(Chapter chapter);
Task GetBookPage(int page, int chapterId, string cachedEpubPath, string baseUrl, List ptocBookmarks, List annotations);
Task> CreateKeyToPageMappingAsync(EpubBookRef book);
+ Task?> GetWordCountsPerPage(string bookFilePath);
}
public class BookService : IBookService
@@ -955,6 +957,50 @@ public class BookService : IBookService
return dict;
}
+ /// <summary>
+ /// Calculates, for every page in the epub's reading order, how many letter characters the page body contains.
+ /// </summary>
+ /// <remarks>
+ /// Despite the name, the values are letter counts (<c>char.IsLetter</c>), not word counts.
+ /// The book file is opened and every content file is parsed on each call.
+ /// </remarks>
+ /// <param name="bookFilePath">Path of the epub file to open</param>
+ /// <returns>Map of page number to letter count, or null if any exception occurs while processing the book (the exception is logged)</returns>
+ public async Task<IDictionary<int, int>?> GetWordCountsPerPage(string bookFilePath)
+ {
+     var ret = new Dictionary<int, int>();
+     try
+     {
+         using var book = await EpubReader.OpenBookAsync(bookFilePath, LenientBookReaderOptions);
+         var mappings = await CreateKeyToPageMappingAsync(book);
+
+         var doc = new HtmlDocument {OptionFixNestedTags = true};
+
+         var bookPages = await book.GetReadingOrderAsync();
+         foreach (var contentFileRef in bookPages)
+         {
+             var page = mappings[contentFileRef.Key];
+             var content = await contentFileRef.ReadContentAsync();
+             doc.LoadHtml(content);
+
+             var body = doc.DocumentNode.SelectSingleNode("//body");
+
+             if (body == null)
+             {
+                 _logger.LogError("{FilePath} has no body tag! Generating one for support. Book may be skewed", book.FilePath);
+                 // A page may also lack an <html> root; in that case there is nothing to attach a body to
+                 // and the page is counted as empty instead of failing the whole book.
+                 doc.DocumentNode.SelectSingleNode("/html")?.AppendChild(HtmlNode.CreateNode("<body></body>"));
+                 body = doc.DocumentNode.SelectSingleNode("//html/body");
+             }
+
+             // Find all text in the html body
+             // TEMP: Refactor this to use WordCountAnalyzerService
+             // The leading "." keeps the query relative to body; a bare "//" would search the whole
+             // document and also count text from <head> (title, style, ...).
+             var textNodes = body?.SelectNodes(".//text()[not(parent::script)]");
+             ret.Add(page, textNodes?.Sum(node => node.InnerText.Count(char.IsLetter)) ?? 0);
+         }
+     }
+     catch (Exception ex)
+     {
+         _logger.LogError(ex, "There was an issue calculating word counts per page");
+         return null;
+     }
+
+     return ret;
+ }
+
///
/// Parses out Title from book. Chapters and Volumes will always be "0". If there is any exception reading book (malformed books)
/// then null is returned. This expects only an epub file
diff --git a/API/Services/Tasks/Metadata/WordCountAnalyzerService.cs b/API/Services/Tasks/Metadata/WordCountAnalyzerService.cs
index bff7001bd..a1c1e4e02 100644
--- a/API/Services/Tasks/Metadata/WordCountAnalyzerService.cs
+++ b/API/Services/Tasks/Metadata/WordCountAnalyzerService.cs
@@ -35,7 +35,7 @@ public class WordCountAnalyzerService : IWordCountAnalyzerService
private readonly IReaderService _readerService;
private readonly IMediaErrorService _mediaErrorService;
- private const int AverageCharactersPerWord = 5;
+ public const int AverageCharactersPerWord = 5;
public WordCountAnalyzerService(ILogger logger, IUnitOfWork unitOfWork, IEventHub eventHub,
ICacheHelper cacheHelper, IReaderService readerService, IMediaErrorService mediaErrorService)
diff --git a/UI/Web/src/app/book-reader/_components/book-reader/book-reader.component.ts b/UI/Web/src/app/book-reader/_components/book-reader/book-reader.component.ts
index 057e9d848..c33768b82 100644
--- a/UI/Web/src/app/book-reader/_components/book-reader/book-reader.component.ts
+++ b/UI/Web/src/app/book-reader/_components/book-reader/book-reader.component.ts
@@ -662,7 +662,7 @@ export class BookReaderComponent implements OnInit, AfterViewInit, OnDestroy {
this.cdRef.markForCheck();
- this.bookService.getBookInfo(this.chapterId).subscribe(async (info) => {
+ this.bookService.getBookInfo(this.chapterId, true).subscribe(async (info) => {
if (this.readingListMode && info.seriesFormat !== MangaFormat.EPUB) {
// Redirect to the manga reader.
const params = this.readerService.getQueryParamsObject(this.incognitoMode, this.readingListMode, this.readingListId);
diff --git a/UI/Web/src/app/book-reader/_models/book-info.ts b/UI/Web/src/app/book-reader/_models/book-info.ts
index 4816bd324..e564bf84d 100644
--- a/UI/Web/src/app/book-reader/_models/book-info.ts
+++ b/UI/Web/src/app/book-reader/_models/book-info.ts
@@ -1,9 +1,13 @@
-import { MangaFormat } from "src/app/_models/manga-format";
+import {MangaFormat} from "src/app/_models/manga-format";
export interface BookInfo {
- bookTitle: string;
- seriesFormat: MangaFormat;
- seriesId: number;
- libraryId: number;
- volumeId: number;
-}
\ No newline at end of file
+ bookTitle: string;
+ seriesFormat: MangaFormat;
+ seriesId: number;
+ libraryId: number;
+ volumeId: number;
+ /**
+ * Maps the page number to that page's character (letter) count. Only populated for epubs when
+ * `includeWordCounts` is requested; otherwise the API returns null.
+ */
+ pageWordCounts: {[key: number]: number};
+}
diff --git a/UI/Web/src/app/book-reader/_services/book.service.ts b/UI/Web/src/app/book-reader/_services/book.service.ts
index d98f09f38..5dd433482 100644
--- a/UI/Web/src/app/book-reader/_services/book.service.ts
+++ b/UI/Web/src/app/book-reader/_services/book.service.ts
@@ -1,9 +1,9 @@
-import { HttpClient } from '@angular/common/http';
-import { Injectable } from '@angular/core';
-import { TextResonse } from 'src/app/_types/text-response';
-import { environment } from 'src/environments/environment';
-import { BookChapterItem } from '../_models/book-chapter-item';
-import { BookInfo } from '../_models/book-info';
+import {HttpClient} from '@angular/common/http';
+import {Injectable} from '@angular/core';
+import {TextResonse} from 'src/app/_types/text-response';
+import {environment} from 'src/environments/environment';
+import {BookChapterItem} from '../_models/book-chapter-item';
+import {BookInfo} from '../_models/book-info';
export interface FontFamily {
/**
@@ -28,7 +28,8 @@ export class BookService {
getFontFamilies(): Array {
return [{title: 'default', family: 'default'}, {title: 'EBGaramond', family: 'EBGaramond'}, {title: 'Fira Sans', family: 'Fira_Sans'},
{title: 'Lato', family: 'Lato'}, {title: 'Libre Baskerville', family: 'Libre_Baskerville'}, {title: 'Merriweather', family: 'Merriweather'},
- {title: 'Nanum Gothic', family: 'Nanum_Gothic'}, {title: 'Open Dyslexic', family: 'OpenDyslexic2'}, {title: 'RocknRoll One', family: 'RocknRoll_One'}, {title: 'Fast Font Serif (Bionic)', family: 'FastFontSerif'}, {title: 'Fast Font Sans (Bionic)', family: 'FastFontSans'}];
+ {title: 'Nanum Gothic', family: 'Nanum_Gothic'}, {title: 'Open Dyslexic', family: 'OpenDyslexic2'}, {title: 'RocknRoll One', family: 'RocknRoll_One'},
+ {title: 'Fast Font Serif (Bionic)', family: 'FastFontSerif'}, {title: 'Fast Font Sans (Bionic)', family: 'FastFontSans'}];
}
getBookChapters(chapterId: number) {
@@ -39,8 +40,8 @@ export class BookService {
return this.http.get(this.baseUrl + 'book/' + chapterId + '/book-page?page=' + page, TextResonse);
}
- getBookInfo(chapterId: number) {
- return this.http.get(this.baseUrl + 'book/' + chapterId + '/book-info');
+ /**
+  * Requests the book info for a chapter from the `book/{chapterId}/book-info` endpoint.
+  * @param chapterId Id of the chapter; placed in the URL path
+  * @param includeWordCounts Sent as the `includeWordCounts` query parameter. Defaults to false.
+  */
+ getBookInfo(chapterId: number, includeWordCounts: boolean = false) {
+ return this.http.get(this.baseUrl + `book/${chapterId}/book-info?includeWordCounts=${includeWordCounts}`);
}
getBookPageUrl(chapterId: number, page: number) {