Word Count (#1286)

* Adding some code for Robbie

* See more on series detail metadata area is now at the bottom on the section

* Cleaned up subtitle headings to use a single class for offset with actionables

* Added some markup for the new design, waiting for Robbie to finish it off

* styling age-rating badge

* Started hooking up basic analyze file service and hooks in the UI. Basic code to implement the count is implemented and in benchmarks.

* Hooked up analyze ui to backend

* Refactored Series Detail metadata area to use a new icon/title design

* Cleaned up the new design

* Pushing for robbie to do css

* Massive performance improvement to scan series where we only need to scan folders reported that have series in them, rather than the whole library.

* Removed theme page as we no longer need it. Added WordCount to DTOs so the UI can show them. Added new pipe to format numbers in compact mode.

* Hooked up actual reading time based on user's words per hour

* Refactor some magic numbers to consts

* Hooked in progress reporting for series word count

* Hooked up analyze files

* Re-implemented time to read on comics

* Removed the word Last Read

* Show proper language name instead of iso tag on series detail page. Added some error handling on word count code.

* Reworked error handling

* Fixed some security vulnerabilities in npm.

* Handle a case where there are no text nodes and instead of returning an empty list, htmlagilitypack returns null.

* Tweaked the styles a bit on the icon-and-title

* Code cleanup

Co-authored-by: Robbie Davis <robbie@therobbiedavis.com>
This commit is contained in:
Joseph Milazzo 2022-05-25 16:53:39 -05:00 committed by GitHub
parent 0a70ac35dc
commit c1490d6e86
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
48 changed files with 2354 additions and 408 deletions

View file

@ -12,7 +12,9 @@ using API.Entities;
using API.Entities.Enums;
using API.Extensions;
using API.Helpers;
using API.Services.Tasks.Metadata;
using API.SignalR;
using Hangfire;
using Microsoft.AspNetCore.SignalR;
using Microsoft.Extensions.Logging;
@ -194,6 +196,8 @@ public class MetadataService : IMetadataService
/// <remarks>This can be heavy on memory first run</remarks>
/// <param name="libraryId"></param>
/// <param name="forceUpdate">Force updating cover image even if underlying file has not been modified or chapter already has a cover image</param>
[DisableConcurrentExecution(timeoutInSeconds: 360)]
[AutomaticRetry(Attempts = 0, OnAttemptsExceeded = AttemptsExceededAction.Delete)]
public async Task RefreshMetadata(int libraryId, bool forceUpdate = false)
{
var library = await _unitOfWork.LibraryRepository.GetLibraryForIdAsync(libraryId, LibraryIncludes.None);
@ -256,10 +260,10 @@ public class MetadataService : IMetadataService
await RemoveAbandonedMetadataKeys();
_logger.LogInformation("[MetadataService] Updated metadata for {SeriesNumber} series in library {LibraryName} in {ElapsedMilliseconds} milliseconds total", chunkInfo.TotalSize, library.Name, totalTime);
}
private async Task RemoveAbandonedMetadataKeys()
{
await _unitOfWork.TagRepository.RemoveAllTagNoLongerAssociated();

View file

@ -6,6 +6,7 @@ using API.Data;
using API.Entities.Enums;
using API.Helpers.Converters;
using API.Services.Tasks;
using API.Services.Tasks.Metadata;
using Hangfire;
using Hangfire.Storage;
using Microsoft.Extensions.Logging;
@ -22,6 +23,8 @@ public interface ITaskScheduler
void RefreshMetadata(int libraryId, bool forceUpdate = true);
void RefreshSeriesMetadata(int libraryId, int seriesId, bool forceUpdate = false);
void ScanSeries(int libraryId, int seriesId, bool forceUpdate = false);
void AnalyzeFilesForSeries(int libraryId, int seriesId, bool forceUpdate = false);
void AnalyzeFilesForLibrary(int libraryId, bool forceUpdate = false);
void CancelStatsTasks();
Task RunStatCollection();
void ScanSiteThemes();
@ -41,6 +44,7 @@ public class TaskScheduler : ITaskScheduler
private readonly IStatsService _statsService;
private readonly IVersionUpdaterService _versionUpdaterService;
private readonly IThemeService _themeService;
private readonly IWordCountAnalyzerService _wordCountAnalyzerService;
public static BackgroundJobServer Client => new BackgroundJobServer();
private static readonly Random Rnd = new Random();
@ -49,7 +53,7 @@ public class TaskScheduler : ITaskScheduler
public TaskScheduler(ICacheService cacheService, ILogger<TaskScheduler> logger, IScannerService scannerService,
IUnitOfWork unitOfWork, IMetadataService metadataService, IBackupService backupService,
ICleanupService cleanupService, IStatsService statsService, IVersionUpdaterService versionUpdaterService,
IThemeService themeService)
IThemeService themeService, IWordCountAnalyzerService wordCountAnalyzerService)
{
_cacheService = cacheService;
_logger = logger;
@ -61,6 +65,7 @@ public class TaskScheduler : ITaskScheduler
_statsService = statsService;
_versionUpdaterService = versionUpdaterService;
_themeService = themeService;
_wordCountAnalyzerService = wordCountAnalyzerService;
}
public async Task ScheduleTasks()
@ -111,6 +116,11 @@ public class TaskScheduler : ITaskScheduler
RecurringJob.AddOrUpdate("report-stats", () => _statsService.Send(), Cron.Daily(Rnd.Next(0, 22)), TimeZoneInfo.Local);
}
public void AnalyzeFilesForLibrary(int libraryId, bool forceUpdate = false)
{
BackgroundJob.Enqueue(() => _wordCountAnalyzerService.ScanLibrary(libraryId, forceUpdate));
}
public void CancelStatsTasks()
{
_logger.LogDebug("Cancelling/Removing StatsTasks");
@ -182,6 +192,12 @@ public class TaskScheduler : ITaskScheduler
BackgroundJob.Enqueue(() => _scannerService.ScanSeries(libraryId, seriesId, CancellationToken.None));
}
public void AnalyzeFilesForSeries(int libraryId, int seriesId, bool forceUpdate = false)
{
_logger.LogInformation("Enqueuing analyze files scan for: {SeriesId}", seriesId);
BackgroundJob.Enqueue(() => _wordCountAnalyzerService.ScanSeries(libraryId, seriesId, forceUpdate));
}
public void BackupDatabase()
{
BackgroundJob.Enqueue(() => _backupService.BackupDatabase());

View file

@ -0,0 +1,218 @@
using System;
using System.Diagnostics;
using System.Linq;
using System.Threading.Tasks;
using API.Data;
using API.Data.Repositories;
using API.Entities;
using API.Entities.Enums;
using API.Helpers;
using API.SignalR;
using Hangfire;
using HtmlAgilityPack;
using Microsoft.Extensions.Logging;
using VersOne.Epub;
namespace API.Services.Tasks.Metadata;
public interface IWordCountAnalyzerService
{
Task ScanLibrary(int libraryId, bool forceUpdate = false);
Task ScanSeries(int libraryId, int seriesId, bool forceUpdate = false);
}
/// <summary>
/// This service is a metadata task that generates information around time to read
/// </summary>
public class WordCountAnalyzerService : IWordCountAnalyzerService
{
private readonly ILogger<WordCountAnalyzerService> _logger;
private readonly IUnitOfWork _unitOfWork;
private readonly IEventHub _eventHub;
private readonly ICacheHelper _cacheHelper;
public WordCountAnalyzerService(ILogger<WordCountAnalyzerService> logger, IUnitOfWork unitOfWork, IEventHub eventHub,
ICacheHelper cacheHelper)
{
_logger = logger;
_unitOfWork = unitOfWork;
_eventHub = eventHub;
_cacheHelper = cacheHelper;
}
[DisableConcurrentExecution(timeoutInSeconds: 360)]
[AutomaticRetry(Attempts = 0, OnAttemptsExceeded = AttemptsExceededAction.Delete)]
public async Task ScanLibrary(int libraryId, bool forceUpdate = false)
{
var sw = Stopwatch.StartNew();
var library = await _unitOfWork.LibraryRepository.GetLibraryForIdAsync(libraryId, LibraryIncludes.None);
await _eventHub.SendMessageAsync(MessageFactory.NotificationProgress,
MessageFactory.WordCountAnalyzerProgressEvent(libraryId, 0F, ProgressEventType.Started, string.Empty));
var chunkInfo = await _unitOfWork.SeriesRepository.GetChunkInfo(library.Id);
var stopwatch = Stopwatch.StartNew();
var totalTime = 0L;
_logger.LogInformation("[MetadataService] Refreshing Library {LibraryName}. Total Items: {TotalSize}. Total Chunks: {TotalChunks} with {ChunkSize} size", library.Name, chunkInfo.TotalSize, chunkInfo.TotalChunks, chunkInfo.ChunkSize);
await _eventHub.SendMessageAsync(MessageFactory.NotificationProgress,
MessageFactory.WordCountAnalyzerProgressEvent(library.Id, 0F, ProgressEventType.Started, $"Starting {library.Name}"));
for (var chunk = 1; chunk <= chunkInfo.TotalChunks; chunk++)
{
if (chunkInfo.TotalChunks == 0) continue;
totalTime += stopwatch.ElapsedMilliseconds;
stopwatch.Restart();
_logger.LogInformation("[MetadataService] Processing chunk {ChunkNumber} / {TotalChunks} with size {ChunkSize}. Series ({SeriesStart} - {SeriesEnd}",
chunk, chunkInfo.TotalChunks, chunkInfo.ChunkSize, chunk * chunkInfo.ChunkSize, (chunk + 1) * chunkInfo.ChunkSize);
var nonLibrarySeries = await _unitOfWork.SeriesRepository.GetFullSeriesForLibraryIdAsync(library.Id,
new UserParams()
{
PageNumber = chunk,
PageSize = chunkInfo.ChunkSize
});
_logger.LogDebug("[MetadataService] Fetched {SeriesCount} series for refresh", nonLibrarySeries.Count);
var seriesIndex = 0;
foreach (var series in nonLibrarySeries)
{
var index = chunk * seriesIndex;
var progress = Math.Max(0F, Math.Min(1F, index * 1F / chunkInfo.TotalSize));
await _eventHub.SendMessageAsync(MessageFactory.NotificationProgress,
MessageFactory.WordCountAnalyzerProgressEvent(library.Id, progress, ProgressEventType.Updated, series.Name));
try
{
await ProcessSeries(series, forceUpdate, false);
}
catch (Exception ex)
{
_logger.LogError(ex, "[MetadataService] There was an exception during metadata refresh for {SeriesName}", series.Name);
}
seriesIndex++;
}
if (_unitOfWork.HasChanges())
{
await _unitOfWork.CommitAsync();
}
_logger.LogInformation(
"[MetadataService] Processed {SeriesStart} - {SeriesEnd} out of {TotalSeries} series in {ElapsedScanTime} milliseconds for {LibraryName}",
chunk * chunkInfo.ChunkSize, (chunk * chunkInfo.ChunkSize) + nonLibrarySeries.Count, chunkInfo.TotalSize, stopwatch.ElapsedMilliseconds, library.Name);
}
await _eventHub.SendMessageAsync(MessageFactory.NotificationProgress,
MessageFactory.WordCountAnalyzerProgressEvent(library.Id, 1F, ProgressEventType.Ended, $"Complete"));
_logger.LogInformation("[WordCountAnalyzerService] Updated metadata for {LibraryName} in {ElapsedMilliseconds} milliseconds", library.Name, sw.ElapsedMilliseconds);
}
public async Task ScanSeries(int libraryId, int seriesId, bool forceUpdate = false)
{
var sw = Stopwatch.StartNew();
var series = await _unitOfWork.SeriesRepository.GetFullSeriesForSeriesIdAsync(seriesId);
if (series == null)
{
_logger.LogError("[WordCountAnalyzerService] Series {SeriesId} was not found on Library {LibraryId}", seriesId, libraryId);
return;
}
await _eventHub.SendMessageAsync(MessageFactory.NotificationProgress,
MessageFactory.WordCountAnalyzerProgressEvent(libraryId, 0F, ProgressEventType.Started, series.Name));
await ProcessSeries(series);
if (_unitOfWork.HasChanges())
{
await _unitOfWork.CommitAsync();
}
await _eventHub.SendMessageAsync(MessageFactory.NotificationProgress,
MessageFactory.WordCountAnalyzerProgressEvent(libraryId, 1F, ProgressEventType.Ended, series.Name));
_logger.LogInformation("[WordCountAnalyzerService] Updated metadata for {SeriesName} in {ElapsedMilliseconds} milliseconds", series.Name, sw.ElapsedMilliseconds);
}
private async Task ProcessSeries(Series series, bool forceUpdate = false, bool useFileName = true)
{
if (series.Format != MangaFormat.Epub) return;
long totalSum = 0;
foreach (var chapter in series.Volumes.SelectMany(v => v.Chapters))
{
// This compares if it's changed since a file scan only
if (!_cacheHelper.HasFileNotChangedSinceCreationOrLastScan(chapter, false,
chapter.Files.FirstOrDefault()) && chapter.WordCount != 0)
continue;
long sum = 0;
var fileCounter = 1;
foreach (var file in chapter.Files.Select(file => file.FilePath))
{
var pageCounter = 1;
try
{
using var book = await EpubReader.OpenBookAsync(file, BookService.BookReaderOptions);
var totalPages = book.Content.Html.Values;
foreach (var bookPage in totalPages)
{
var progress = Math.Max(0F,
Math.Min(1F, (fileCounter * pageCounter) * 1F / (chapter.Files.Count * totalPages.Count)));
await _eventHub.SendMessageAsync(MessageFactory.NotificationProgress,
MessageFactory.WordCountAnalyzerProgressEvent(series.LibraryId, progress,
ProgressEventType.Updated, useFileName ? file : series.Name));
sum += await GetWordCountFromHtml(bookPage);
pageCounter++;
}
fileCounter++;
}
catch (Exception ex)
{
_logger.LogError(ex, "There was an error reading an epub file for word count, series skipped");
await _eventHub.SendMessageAsync(MessageFactory.Error,
MessageFactory.ErrorEvent("There was an issue counting words on an epub",
$"{series.Name} - {file}"));
return;
}
}
chapter.WordCount = sum;
_unitOfWork.ChapterRepository.Update(chapter);
totalSum += sum;
}
series.WordCount = totalSum;
_unitOfWork.SeriesRepository.Update(series);
}
private static async Task<int> GetWordCountFromHtml(EpubContentFileRef bookFile)
{
var doc = new HtmlDocument();
doc.LoadHtml(await bookFile.ReadContentAsTextAsync());
var textNodes = doc.DocumentNode.SelectNodes("//body//text()[not(parent::script)]");
if (textNodes == null) return 0;
return textNodes
.Select(node => node.InnerText)
.Select(text => text.Split(' ', StringSplitOptions.RemoveEmptyEntries)
.Where(s => char.IsLetter(s[0])))
.Select(words => words.Count())
.Where(wordCount => wordCount > 0)
.Sum();
}
}

View file

@ -14,6 +14,7 @@ using API.Entities.Enums;
using API.Extensions;
using API.Helpers;
using API.Parser;
using API.Services.Tasks.Metadata;
using API.Services.Tasks.Scanner;
using API.SignalR;
using Hangfire;
@ -43,11 +44,12 @@ public class ScannerService : IScannerService
private readonly IDirectoryService _directoryService;
private readonly IReadingItemService _readingItemService;
private readonly ICacheHelper _cacheHelper;
private readonly IWordCountAnalyzerService _wordCountAnalyzerService;
public ScannerService(IUnitOfWork unitOfWork, ILogger<ScannerService> logger,
IMetadataService metadataService, ICacheService cacheService, IEventHub eventHub,
IFileService fileService, IDirectoryService directoryService, IReadingItemService readingItemService,
ICacheHelper cacheHelper)
ICacheHelper cacheHelper, IWordCountAnalyzerService wordCountAnalyzerService)
{
_unitOfWork = unitOfWork;
_logger = logger;
@ -58,6 +60,7 @@ public class ScannerService : IScannerService
_directoryService = directoryService;
_readingItemService = readingItemService;
_cacheHelper = cacheHelper;
_wordCountAnalyzerService = wordCountAnalyzerService;
}
[DisableConcurrentExecution(timeoutInSeconds: 360)]
@ -71,6 +74,15 @@ public class ScannerService : IScannerService
var library = await _unitOfWork.LibraryRepository.GetLibraryForIdAsync(libraryId, LibraryIncludes.Folders);
var folderPaths = library.Folders.Select(f => f.Path).ToList();
var seriesFolderPaths = (await _unitOfWork.SeriesRepository.GetFilesForSeries(seriesId))
.Select(f => _directoryService.FileSystem.FileInfo.FromFileName(f.FilePath).Directory.FullName)
.ToList();
if (!await CheckMounts(library.Name, seriesFolderPaths))
{
_logger.LogCritical("Some of the root folders for library are not accessible. Please check that drives are connected and rescan. Scan will be aborted");
return;
}
if (!await CheckMounts(library.Name, library.Folders.Select(f => f.Path).ToList()))
{
@ -82,10 +94,15 @@ public class ScannerService : IScannerService
var allGenres = await _unitOfWork.GenreRepository.GetAllGenresAsync();
var allTags = await _unitOfWork.TagRepository.GetAllTagsAsync();
var dirs = _directoryService.FindHighestDirectoriesFromFiles(folderPaths, files.Select(f => f.FilePath).ToList());
var seriesDirs = _directoryService.FindHighestDirectoriesFromFiles(seriesFolderPaths, files.Select(f => f.FilePath).ToList());
if (seriesDirs.Keys.Count == 0)
{
_logger.LogDebug("Scan Series has files spread outside a main series folder. Defaulting to library folder");
seriesDirs = _directoryService.FindHighestDirectoriesFromFiles(folderPaths, files.Select(f => f.FilePath).ToList());
}
_logger.LogInformation("Beginning file scan on {SeriesName}", series.Name);
var (totalFiles, scanElapsedTime, parsedSeries) = await ScanFiles(library, dirs.Keys);
var (totalFiles, scanElapsedTime, parsedSeries) = await ScanFiles(library, seriesDirs.Keys);
@ -117,10 +134,10 @@ public class ScannerService : IScannerService
// We need to do an additional check for an edge case: If the scan ran and the files do not match the existing Series name, then it is very likely,
// the files have crap naming and if we don't correct, the series will get deleted due to the parser not being able to fallback onto folder parsing as the root
// is the series folder.
var existingFolder = dirs.Keys.FirstOrDefault(key => key.Contains(series.OriginalName));
if (dirs.Keys.Count == 1 && !string.IsNullOrEmpty(existingFolder))
var existingFolder = seriesDirs.Keys.FirstOrDefault(key => key.Contains(series.OriginalName));
if (seriesDirs.Keys.Count == 1 && !string.IsNullOrEmpty(existingFolder))
{
dirs = new Dictionary<string, string>();
seriesDirs = new Dictionary<string, string>();
var path = Directory.GetParent(existingFolder)?.FullName;
if (!folderPaths.Contains(path) || !folderPaths.Any(p => p.Contains(path ?? string.Empty)))
{
@ -131,11 +148,11 @@ public class ScannerService : IScannerService
}
if (!string.IsNullOrEmpty(path))
{
dirs[path] = string.Empty;
seriesDirs[path] = string.Empty;
}
}
var (totalFiles2, scanElapsedTime2, parsedSeries2) = await ScanFiles(library, dirs.Keys);
var (totalFiles2, scanElapsedTime2, parsedSeries2) = await ScanFiles(library, seriesDirs.Keys);
_logger.LogInformation("{SeriesName} has bad naming convention, forcing rescan at a higher directory", series.OriginalName);
totalFiles += totalFiles2;
scanElapsedTime += scanElapsedTime2;
@ -303,10 +320,8 @@ public class ScannerService : IScannerService
await CleanupDbEntities();
// await _eventHub.SendMessageAsync(SignalREvents.NotificationProgress,
// MessageFactory.ScanLibraryProgressEvent(libraryId, 1F));
BackgroundJob.Enqueue(() => _metadataService.RefreshMetadata(libraryId, false));
BackgroundJob.Enqueue(() => _wordCountAnalyzerService.ScanLibrary(libraryId, false));
}
private async Task<Tuple<int, long, Dictionary<ParsedSeries, List<ParserInfo>>>> ScanFiles(Library library, IEnumerable<string> dirs)