115 lines
3.8 KiB
C#
115 lines
3.8 KiB
C#
using System;
using System.Text.RegularExpressions;
using ImageMagick;
using Lookup.Memory;
using Ocr.Tesseract;
using Ocr.Tesseract.Configuration;
using Ocr.Tesseract.Models;
using Ocr.Tesseract.Screenshots;
using Ocr.Tesseract.Screenshots.Configuration;
using Ocr.Tesseract.Screenshots.Threshold;
using Process.Abstract.Configuration;
using Process.Interface;
|
|
|
|
namespace Common
{
    /// <summary>
    /// Scanner class, scanning <see cref="MagickImage"/>s for <see cref="Word"/>s
    /// via optical character recognition. Optimized for digital Screenshots.
    /// </summary>
    /// <remarks>
    /// The processing chain is assembled once, in the constructor. Derived classes can
    /// override <see cref="OnProcessing"/> and <see cref="OnProcessed"/> to observe the
    /// data flowing through the chain.
    /// </remarks>
    public class ScreenshotScanner
    {
        /// <summary>
        /// Argument passed to the <see cref="AddBorderProcessor"/> preprocessing step.
        /// </summary>
        private const int BorderSize = 10;

        /// <summary>
        /// Argument passed to the <see cref="ConfidenceFilter"/> postprocessing step.
        /// </summary>
        private const int ConfidenceThreshold = 50;

        /// <summary>
        /// <see cref="Regex"/> expression for extracting whole words from scan results:
        /// runs of at least two word characters, apostrophes or hyphens.
        /// </summary>
        private static readonly Regex WordRegex = new(
            @"[\w'\-]{2,}",
            RegexOptions.Compiled | RegexOptions.Multiline | RegexOptions.IgnoreCase
        );

        /// <summary>
        /// The complete chain (preprocessing, scan, postprocessing) built by
        /// <see cref="MakeProcessor"/>.
        /// </summary>
        private readonly IProcessor<MagickImage, ScanResult> _processor;

        /// <summary>
        /// Data storage. <see cref="Process"/> adds one entry per scan result.
        /// </summary>
        public Lookup.Interface.ILookup<Word, MagickImage> Lookup { get; } =
            new MemoryLookup<Word, MagickImage>();

        /// <summary>
        /// Configuration of the image preprocessing. Its <c>ThresholdWidth</c> and
        /// <c>ThresholdHeight</c> values are handed to the adaptive threshold step.
        /// </summary>
        public ScreenshotProcessorConfiguration ImageProcessorConfiguration { get; }

        /// <summary>
        /// Configuration handed to the <see cref="TesseractProcessor"/> that performs the scan.
        /// </summary>
        public ITesseractConfiguration TesseractConfiguration { get; }

        /// <summary>
        /// Creates a scanner and builds its processing chain from the given configurations.
        /// </summary>
        /// <param name="imageProcessorConfig">Configuration of the image preprocessing</param>
        /// <param name="tesseractConfig">Configuration of the Tesseract scan</param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="imageProcessorConfig"/> or <paramref name="tesseractConfig"/> is null.
        /// </exception>
        public ScreenshotScanner(
            ScreenshotProcessorConfiguration imageProcessorConfig,
            ITesseractConfiguration tesseractConfig
        )
        {
            // Fail fast with a descriptive exception instead of a NullReferenceException
            // surfacing later from inside MakeProcessor.
            ArgumentNullException.ThrowIfNull(imageProcessorConfig);
            ArgumentNullException.ThrowIfNull(tesseractConfig);

            ImageProcessorConfiguration = imageProcessorConfig;
            TesseractConfiguration = tesseractConfig;

            // Must run after both properties are assigned: MakeProcessor reads them.
            _processor = MakeProcessor();
        }

        /// <summary>
        /// Process the provided <paramref name="images"/> and add the results to
        /// the <see cref="Lookup"/>
        /// </summary>
        /// <param name="images">The <see cref="MagickImage"/>s to process</param>
        /// <exception cref="ArgumentNullException"><paramref name="images"/> is null.</exception>
        public void Process(IEnumerable<MagickImage> images)
        {
            ArgumentNullException.ThrowIfNull(images);

            foreach (var result in _processor.Process(images))
            {
                Lookup.Add(result.Word, result.Image);
            }
        }

        /// <summary>
        /// Builds the processing chain: image preprocessing, the <see cref="OnProcessing"/> hook,
        /// the Tesseract scan, the <see cref="OnProcessed"/> hook and result postprocessing,
        /// registered in that order.
        /// </summary>
        /// <returns>The completed processor chain</returns>
        private IProcessor<MagickImage, ScanResult> MakeProcessor()
        {
            // Earlier revisions kept alternatives here as commented-out code:
            // AutoThresholdProcessor with AutoThresholdMethod.Kapur, .OTSU or .Triangle,
            // and a fixed ThresholdProcessor(60).
            var threshold = new ThresholdAdaptiveProcessor(
                ImageProcessorConfiguration.ThresholdWidth,
                ImageProcessorConfiguration.ThresholdHeight);

            var preprocessing = new ProcessorChainConfiguration<MagickImage, MagickImage>()
                .Use(new CloneImageProcessor())
                .Use(new ResizeProcessor(FilterType.Lanczos2Sharp, PixelInterpolateMethod.Mesh))
                .Use(new NormalizeProcessor())
                .Use(threshold)
                .Use(new AddBorderProcessor(BorderSize))
                .Use(new BinarizeProcessor())
                .Complete(new NegateCloneProcessor());

            var postprocessing = new ProcessorChainConfiguration<ScanResult, ScanResult>()
                .Use(new ConfidenceFilter(ConfidenceThreshold))
                .Use(new ToLowerProcessor())
                .Use(new DuplicateFilter())
                .Complete(new RegexFilter(WordRegex));

            var scan = new TesseractProcessor(TesseractConfiguration);

            return new ProcessorChainConfiguration<MagickImage, ScanResult>()
                .Use(preprocessing)
                .Use(new ProcessingEvent<MagickImage>(OnProcessing))
                .Use(scan)
                .Use(new ProcessingEvent<ScanResult>(OnProcessed))
                .Complete(postprocessing);
        }

        /// <summary>
        /// Hook registered in the chain between preprocessing and the Tesseract scan.
        /// The base implementation does nothing.
        /// </summary>
        /// <param name="sender">
        /// Processor supplied by the <see cref="ProcessingEvent{T}"/> step (presumably the step itself)
        /// </param>
        /// <param name="inputs">
        /// The images at this point of the chain (by position, presumably the preprocessed ones)
        /// </param>
        protected virtual void OnProcessing(IProcessor sender, ICollection<MagickImage> inputs)
        {
        }

        /// <summary>
        /// Hook registered in the chain between the Tesseract scan and postprocessing.
        /// The base implementation does nothing.
        /// </summary>
        /// <param name="sender">
        /// Processor supplied by the <see cref="ProcessingEvent{T}"/> step (presumably the step itself)
        /// </param>
        /// <param name="inputs">
        /// The scan results at this point of the chain (by position, presumably not yet filtered)
        /// </param>
        protected virtual void OnProcessed(IProcessor sender, ICollection<ScanResult> inputs)
        {
        }
    }
}
|