This repository has been archived on 2024-06-04. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
thesis-src/Examples/CLI/EvaluationProcessor.cs
T
Simon Gruber d8008e4f05 a
2023-11-20 07:43:12 +01:00

102 lines
3.2 KiB
C#

using ImageMagick;
using Ocr.Tesseract;
using Ocr.Tesseract.Configuration;
using Ocr.Tesseract.Extensions;
using Ocr.Tesseract.Models;
using Ocr.Tesseract.Screenshots;
using Ocr.Tesseract.Screenshots.Configuration;
using Process.Abstract.Configuration;
using Process.Interface;
using System.Text.Json;
using System.Text.RegularExpressions;
/// <summary>
/// Runs a single image through preprocessing, Tesseract OCR and post-filtering,
/// then stores the recognised words as JSON in <see cref="OutputFolder"/>.
/// The thresholding step is supplied by the caller; every other step is fixed.
/// </summary>
internal class EvaluationProcessor
{
    /// <summary>
    /// <see cref="Regex"/> expression for extracting whole words from scan results:
    /// runs of at least two word characters, apostrophes or hyphens.
    /// </summary>
    private static readonly Regex WordRegex = new(
        @"[\w'\-]{2,}",
        RegexOptions.Compiled | RegexOptions.Multiline | RegexOptions.IgnoreCase
    );

    /// <summary>
    /// Tesseract setup shared by all instances: data path <c>tessdata</c>, languages <c>eng</c> and <c>deu</c>.
    /// </summary>
    private static readonly ITesseractConfiguration TesseractConfig = new TesseractScreenshotConfiguration()
    {
        DataPath = "tessdata",
        Languages = new[] { "eng", "deu" }
    };

    // NOTE(review): this configuration is not referenced anywhere else in this class;
    // it is kept as-is in case it is needed for reference — confirm whether it can be removed.
    private ScreenshotProcessorConfiguration ProcessorConfig = new ScreenshotProcessorConfiguration
    {
        Border = 0,
        EnableResizing = false,
        EnableThresholding = false,
        FilterConnectedComponents = false,
        ThresholdHeight = 0,
        ThresholdWidth = 0
    };

    /// <summary>
    /// OCR stage, built once from <see cref="TesseractConfig"/> and shared by all instances.
    /// </summary>
    private static readonly TesseractProcessor tesseractProcessor = new(TesseractConfig);

    /// <summary>
    /// Post-processing applied to the OCR results, in this order:
    /// confidence filter (constructed with 50), lower-casing, duplicate filter, <see cref="WordRegex"/> filter.
    /// </summary>
    private static readonly IProcessorChain<ScanResult, ScanResult> postProcessor =
        new ProcessorChainConfiguration<ScanResult, ScanResult>()
            .Use(new ConfidenceFilter(50))
            .Use(new ToLowerProcessor())
            .Use(new DuplicateFilter())
            .Complete(new RegexFilter(WordRegex));

    /// <summary>
    /// Caller-supplied thresholding step inserted into the preprocessing chain.
    /// Its string representation is also used as part of every output file name.
    /// </summary>
    private readonly IProcessor<MagickImage, MagickImage> _thresholdProcessor;

    /// <summary>
    /// Directory that receives the JSON results and the preprocessed images. Defaults to <c>results</c>.
    /// </summary>
    public string OutputFolder { get; init; } = "results";

    /// <summary>
    /// Creates an evaluation processor around the given thresholding step.
    /// </summary>
    /// <param name="thresholdProcessor">Thresholding step to evaluate.</param>
    public EvaluationProcessor(IProcessor<MagickImage, MagickImage> thresholdProcessor)
    {
        _thresholdProcessor = thresholdProcessor;
    }

    /// <summary>
    /// Processes <paramref name="image"/> on a thread-pool thread and writes the recognised words
    /// to <c>{name}.{thresholdProcessor}.json</c> inside <see cref="OutputFolder"/>, where
    /// <c>name</c> is the image's file name without extension.
    /// </summary>
    /// <param name="image">Image to scan.</param>
    /// <returns>A task that completes once the JSON file has been written.</returns>
    public Task Process(MagickImage image) => Task.Run(async () =>
    {
        Directory.CreateDirectory(OutputFolder);
        var processor = MakeProcessor();
        var results = processor.Process(new[] { image }).Select(r => r.Word);
        var name = Path.GetFileNameWithoutExtension(image.FileName);
        var path = Path.Combine(OutputFolder, $"{name}.{_thresholdProcessor}.json");

        // File.Create truncates an existing file. File.OpenWrite would keep stale trailing
        // bytes from a previous, longer run and leave invalid JSON behind.
        await using var file = File.Create(path);
        await JsonSerializer.SerializeAsync(file, results);
    });

    /// <summary>
    /// Builds the full chain: preprocessing (including the injected threshold step),
    /// Tesseract OCR, then the shared post-processing chain.
    /// </summary>
    private IProcessorChain<MagickImage, ScanResult> MakeProcessor()
    {
        var preprocessing = new ProcessorChainConfiguration<MagickImage, MagickImage>()
            .Use(new CloneImageProcessor())
            .Use(new ResizeProcessor(FilterType.Lanczos2Sharp, PixelInterpolateMethod.Mesh))
            .Use(new NormalizeProcessor())
            .Use(_thresholdProcessor)
            .Use(new AddBorderProcessor(10))
            .Use(new BinarizeProcessor())
            .Use(new NegateCloneProcessor())
            .Complete(OnPreprocessed);

        return new ProcessorChainConfiguration<MagickImage, ScanResult>()
            .Use(preprocessing)
            .Use(tesseractProcessor)
            .Complete(postProcessor);
    }

    /// <summary>
    /// Final preprocessing step: saves a copy of every preprocessed image to
    /// <see cref="OutputFolder"/> as <c>{thresholdProcessor}.{index:D2}.{fileName}</c>
    /// and passes the images on unchanged.
    /// </summary>
    /// <param name="images">Preprocessed images.</param>
    /// <returns>The same images, materialised into an array.</returns>
    private IEnumerable<MagickImage> OnPreprocessed(IEnumerable<MagickImage> images)
    {
        // Materialise once so the sequence is not enumerated a second time downstream.
        var tImages = images.ToArray();
        for (var i = 0; i < tImages.Length; i++)
        {
            // The clone exists only to be written to disk; dispose it right away so its
            // native image resources are released instead of leaking per image.
            using var image = tImages[i].CloneImage();
            var name = Path.GetFileName(image.FileName);
            image.Write(Path.Combine(OutputFolder, $"{_thresholdProcessor}.{i:D2}.{name}"));
        }
        return tImages;
    }
}