This repository has been archived on 2024-06-04. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
thesis-src/Examples/CLI/Processor/EvaluationProcessor.cs
T
Simon Gruber 8ada606fa6 a
2023-11-22 07:46:10 +01:00

112 lines
3.3 KiB
C#

using ImageMagick;
using Ocr.Tesseract;
using Ocr.Tesseract.Configuration;
using Ocr.Tesseract.Extensions;
using Ocr.Tesseract.Models;
using Ocr.Tesseract.Screenshots;
using Ocr.Tesseract.Screenshots.Configuration;
using Process.Abstract.Configuration;
using Process.Interface;
using System.Text.Json;
using System.Text.RegularExpressions;
namespace CLI.Processor;
/// <summary>
/// Runs one image through a preprocessing chain, Tesseract OCR and a post-processing
/// chain, then writes the recognized words plus the <c>Elapsed</c> value of the
/// stopwatch-wrapped threshold processor to a JSON file inside <see cref="OutputFolder"/>.
/// Each preprocessed image is also written to <see cref="OutputFolder"/> for inspection.
/// </summary>
internal class EvaluationProcessor
{
    #region Configuration

    /// <summary>
    /// <see cref="Regex"/> expression for extracting whole words from scan results:
    /// runs of at least two word characters, apostrophes or hyphens.
    /// </summary>
    private static readonly Regex wordRegex = new(
        @"[\w'\-]{2,}",
        RegexOptions.Compiled | RegexOptions.Multiline | RegexOptions.IgnoreCase
    );

    /// <summary>
    /// Tesseract configuration shared by all instances: data path <c>tessdata</c>,
    /// languages <c>eng</c> and <c>deu</c>.
    /// </summary>
    private static readonly ITesseractConfiguration tesseractConfig =
        new TesseractScreenshotConfiguration
        {
            DataPath = "tessdata",
            Languages = new[] { "eng", "deu" }
        };

    #endregion

    #region Processors

    /// <summary>
    /// Post-processing applied to OCR results, in this order: confidence filter
    /// (constructed with 50), lower-casing, duplicate filter, and finally a regex
    /// filter using <see cref="wordRegex"/>.
    /// </summary>
    private static readonly IProcessorChain<ScanResult, ScanResult> postProcessor =
        new ProcessorChainConfiguration<ScanResult, ScanResult>()
            .Use(new ConfidenceFilter(50))
            .Use(new ToLowerProcessor())
            .Use(new DuplicateFilter())
            .Complete(new RegexFilter(wordRegex));

    /// <summary>
    /// OCR processor shared by all instances, built from <see cref="tesseractConfig"/>.
    /// </summary>
    private static readonly TesseractProcessor tesseractProcessor = new(tesseractConfig);

    /// <summary>
    /// The threshold processor under evaluation, wrapped in a
    /// <see cref="StopwatchProcessor{TIn, TOut}"/> whose <c>Elapsed</c> value is reported
    /// in the result file. Its string representation is used in output file names.
    /// </summary>
    private readonly StopwatchProcessor<MagickImage, MagickImage> _thresholdProcessor;

    #endregion

    /// <summary>
    /// Directory that receives the JSON results and the preprocessed images.
    /// Defaults to <c>results</c>.
    /// </summary>
    public string OutputFolder { get; init; } = "results";

    /// <summary>
    /// Creates an evaluation run around the given threshold processor.
    /// </summary>
    /// <param name="thresholdProcessor">Threshold processor to insert into the preprocessing chain.</param>
    public EvaluationProcessor(
        IProcessor<MagickImage, MagickImage> thresholdProcessor
    ) => _thresholdProcessor = new StopwatchProcessor<MagickImage, MagickImage>(thresholdProcessor);

    /// <summary>
    /// Scans <paramref name="image"/> via <see cref="Task.Run(Func{Task})"/> and writes
    /// the recognized words and the threshold processor's elapsed milliseconds to
    /// <c>{image name}.{threshold processor}.json</c> in <see cref="OutputFolder"/>.
    /// </summary>
    /// <param name="image">Image to scan; its file name (without extension) prefixes the output file name.</param>
    /// <returns>A task that completes once the JSON file has been written.</returns>
    public Task Process(MagickImage image) => Task.Run(async () =>
    {
        // The preprocessing callback writes into OutputFolder as well, so the
        // directory has to exist before the chain is executed.
        Directory.CreateDirectory(OutputFolder);

        var words = MakeProcessor()
            .Process(new[] { image })
            .Select(r => r.Word)
            .ToArray();

        // Elapsed is read only after the chain has been fully materialized above.
        var result = new
        {
            Words = words,
            Elapsed = _thresholdProcessor.Elapsed?.TotalMilliseconds,
        };

        var name = Path.GetFileNameWithoutExtension(image.FileName);
        var path = Path.Combine(OutputFolder, $"{name}.{_thresholdProcessor}.json");

        // File.Create truncates an existing file. File.OpenWrite does not, so a
        // shorter result written over a longer one from a previous run would keep
        // stale trailing bytes and yield invalid JSON.
        await using var file = File.Create(path);
        await JsonSerializer.SerializeAsync(file, result);
    });

    /// <summary>
    /// Builds the full chain: preprocessing (including the threshold processor and
    /// the <see cref="OnPreprocessed"/> callback), then OCR, then post-processing.
    /// </summary>
    /// <returns>A chain that turns images into scan results.</returns>
    private IProcessorChain<MagickImage, ScanResult> MakeProcessor()
    {
        var preprocessing = new ProcessorChainConfiguration<MagickImage, MagickImage>()
            .Use(new CloneImageProcessor())
            .Use(new ResizeProcessor(FilterType.Lanczos2Sharp, PixelInterpolateMethod.Mesh))
            .Use(new NormalizeProcessor())
            .Use(_thresholdProcessor)
            .Use(new AddBorderProcessor(10))
            .Use(new BinarizeProcessor())
            .Use(new NegateCloneProcessor())
            .Complete(OnPreprocessed);

        return new ProcessorChainConfiguration<MagickImage, ScanResult>()
            .Use(preprocessing)
            .Use(tesseractProcessor)
            .Complete(postProcessor);
    }

    /// <summary>
    /// Writes a copy of every preprocessed image to <see cref="OutputFolder"/> as
    /// <c>{threshold processor}.{two-digit index}.{file name}</c> and passes the
    /// images on unchanged.
    /// </summary>
    /// <param name="images">Images produced by the preprocessing steps.</param>
    /// <returns>The same images, materialized into an array.</returns>
    private IEnumerable<MagickImage> OnPreprocessed(IEnumerable<MagickImage> images)
    {
        var tImages = images.ToArray();

        for (var i = 0; i < tImages.Length; i++)
        {
            // The clone exists only for this write; dispose it at the end of the
            // iteration so its image resources are released right away.
            using var image = tImages[i].CloneImage();
            var name = Path.GetFileName(image.FileName);
            var path = Path.Combine(OutputFolder, $"{_thresholdProcessor}.{i:D2}.{name}");
            image.Write(path);
        }

        return tImages;
    }
}