110 lines
3.3 KiB
C#
110 lines
3.3 KiB
C#
using ImageMagick;
|
|
using Ocr.Tesseract;
|
|
using Ocr.Tesseract.Configuration;
|
|
using Ocr.Tesseract.Extensions;
|
|
using Ocr.Tesseract.Models;
|
|
using Ocr.Tesseract.Screenshots;
|
|
using Ocr.Tesseract.Screenshots.Configuration;
|
|
using Process.Abstract.Configuration;
|
|
using Process.Interface;
|
|
using System.Text.Json;
|
|
using System.Text.RegularExpressions;
|
|
|
|
namespace Ocr.Cli.Processor;
|
|
|
|
/// <summary>
/// Runs a single image through a preprocessing chain, the shared <see cref="TesseractProcessor"/>
/// and a post-processing chain, then writes the resulting words together with the elapsed time of
/// the wrapped threshold processor to a JSON file inside <see cref="OutputFolder"/>.
/// </summary>
internal class EvaluationProcessor
{
    #region Configuration

    /// <summary>
    /// <see cref="Regex"/> expression for extracting whole words from scan results:
    /// runs of at least two word characters, apostrophes, hyphens or German umlauts/ß.
    /// </summary>
    private static readonly Regex wordRegex = new(
        @"[\w'\-äöüÄÖÜß]{2,}",
        RegexOptions.Compiled | RegexOptions.Multiline | RegexOptions.IgnoreCase
    );

    /// <summary>
    /// Tesseract configuration shared by all instances: data is read from the relative
    /// <c>tessdata</c> path and the <c>eng</c> and <c>deu</c> languages are requested.
    /// </summary>
    private static readonly ITesseractConfiguration tesseractConfig =
        new TesseractScreenshotConfiguration
        {
            DataPath = "tessdata",
            Languages = new[] { "eng", "deu" }
        };

    #endregion

    #region Processors

    /// <summary>
    /// Post-processing chain applied to the scan results, in this order:
    /// <c>ConfidenceFilter(50)</c>, <c>ToLowerProcessor</c>, <c>DuplicateFilter</c> and finally
    /// a <c>RegexFilter</c> built from <see cref="wordRegex"/>.
    /// </summary>
    private static readonly IProcessorChain<ScanResult, ScanResult> postProcessor =
        new ProcessorChainConfiguration<ScanResult, ScanResult>()
            .Use(new ConfidenceFilter(50))
            .Use(new ToLowerProcessor())
            .Use(new DuplicateFilter())
            .Complete(new RegexFilter(wordRegex));

    /// <summary>
    /// Builds the full image-to-words chain: preprocessing (including the timed threshold
    /// processor and the <see cref="OnPreprocessed"/> dump step), then
    /// <see cref="tesseractProcessor"/>, then <see cref="postProcessor"/>.
    /// </summary>
    /// <returns>A new chain that turns images into scan results.</returns>
    private IProcessorChain<MagickImage, ScanResult> MakeProcessor()
    {
        var preprocessing = new ProcessorChainConfiguration<MagickImage, MagickImage>()
            .Use(new CloneImageProcessor())
            .Use(new ResizeProcessor(FilterType.Lanczos2Sharp, PixelInterpolateMethod.Mesh))
            .Use(new NormalizeProcessor())
            .Use(_thresholdProcessor)
            .Use(new AddBorderProcessor(10))
            .Use(new BinarizeProcessor())
            .Use(new NegateCloneProcessor())
            .Complete(OnPreprocessed);

        return new ProcessorChainConfiguration<MagickImage, ScanResult>()
            .Use(preprocessing)
            .Use(tesseractProcessor)
            .Complete(postProcessor);
    }

    /// <summary>Tesseract processor shared by all instances, built from <see cref="tesseractConfig"/>.</summary>
    private static readonly TesseractProcessor tesseractProcessor = new(tesseractConfig);

    /// <summary>
    /// The threshold processor under evaluation, wrapped in a <c>StopwatchProcessor</c> so that
    /// its <c>Elapsed</c> value can be reported. Its string representation is also used as part
    /// of the output file names.
    /// </summary>
    private readonly StopwatchProcessor<MagickImage, MagickImage> _thresholdProcessor;

    #endregion

    /// <summary>
    /// Folder that receives the JSON results and the preprocessed images. Defaults to <c>results</c>.
    /// </summary>
    public string OutputFolder { get; init; } = "results";

    /// <summary>
    /// Creates an evaluation processor for the given threshold processor.
    /// </summary>
    /// <param name="thresholdProcessor">
    /// Processor to evaluate; it is wrapped in a <c>StopwatchProcessor</c> before being used.
    /// </param>
    public EvaluationProcessor(
        IProcessor<MagickImage, MagickImage> thresholdProcessor
    ) => _thresholdProcessor = new StopwatchProcessor<MagickImage, MagickImage>(thresholdProcessor);

    /// <summary>
    /// Processes <paramref name="image"/> on a thread-pool thread and writes a JSON document with
    /// the properties <c>Words</c> and <c>Elapsed</c> (milliseconds, or <c>null</c> when the
    /// stopwatch reports no value) to
    /// <c>{OutputFolder}/{image file name without extension}.{threshold processor}.json</c>.
    /// </summary>
    /// <param name="image">The image to scan; its <c>FileName</c> determines the output name.</param>
    /// <returns>A task that completes once the JSON file has been written.</returns>
    public Task Process(MagickImage image) => Task.Run(async () =>
    {
        var words = MakeProcessor()
            .Process(new[] { image })
            .Select(r => r.Word)
            .ToArray();

        var result = new
        {
            Words = words,
            Elapsed = _thresholdProcessor.Elapsed?.TotalMilliseconds,
        };

        var name = Path.GetFileNameWithoutExtension(image.FileName);
        var path = Path.Combine(OutputFolder, $"{name}.{_thresholdProcessor}.json");

        EnsureOutputFolder();

        // File.Create truncates an existing file. File.OpenWrite would keep the old length and
        // leave stale bytes behind the new (shorter) JSON document, corrupting the result.
        await using var file = File.Create(path);
        await JsonSerializer.SerializeAsync(file, result);
    });

    /// <summary>
    /// Final step of the preprocessing chain: writes a copy of every preprocessed image to
    /// <c>{OutputFolder}/{threshold processor}.{index:D2}.{image file name}</c> and passes the
    /// images on unchanged.
    /// </summary>
    /// <param name="images">The preprocessed images; the sequence is enumerated exactly once.</param>
    /// <returns>The materialized input images.</returns>
    private IEnumerable<MagickImage> OnPreprocessed(IEnumerable<MagickImage> images)
    {
        var tImages = images.ToArray();

        EnsureOutputFolder();

        for (var i = 0; i < tImages.Length; i++)
        {
            // The clone is only needed for writing; dispose it so it is released
            // deterministically instead of lingering until finalization.
            using var image = tImages[i].CloneImage();
            var name = Path.GetFileName(image.FileName);
            var path = Path.Combine(OutputFolder, $"{_thresholdProcessor}.{i:D2}.{name}");
            image.Write(path);
        }

        return tImages;
    }

    /// <summary>
    /// Creates <see cref="OutputFolder"/> if it does not exist yet. An empty folder name means
    /// "current directory" for <see cref="Path.Combine(string, string)"/> and is left alone.
    /// </summary>
    private void EnsureOutputFolder()
    {
        if (!string.IsNullOrEmpty(OutputFolder))
        {
            Directory.CreateDirectory(OutputFolder);
        }
    }
}