85 lines
2.9 KiB
C#
85 lines
2.9 KiB
C#
// See https://aka.ms/new-console-template for more information
|
|
|
|
using System.Text.Json;
|
|
using System.Text.RegularExpressions;
|
|
using ImageMagick;
|
|
using Ocr.Tesseract;
|
|
using Ocr.Tesseract.Models;
|
|
using Ocr.Tesseract.Screenshots;
|
|
using Ocr.Tesseract.Screenshots.Configuration;
|
|
using Ocr.Tesseract.Screenshots.Threshold;
|
|
using Process.Abstract.Configuration;
|
|
using Process.Interface;
|
|
|
|
// Pattern handed to RegexFilter in MakeProcessor: a run of at least two
// characters drawn from word characters, apostrophes and hyphens.
// Multiline and IgnoreCase do not alter matching for this pattern (it has no
// anchors and no literal letters); they are retained so the options stay as-is.
const RegexOptions wordRegexOptions =
    RegexOptions.Compiled | RegexOptions.Multiline | RegexOptions.IgnoreCase;
Regex wordRegex = new(@"[\w'\-]{2,}", wordRegexOptions);
|
|
// Settings passed to TesseractProcessor in MakeProcessor.
// NOTE(review): "tessdata" is a relative path and "eng"/"deu" look like
// Tesseract language codes (English, German) — confirm against the OCR library.
TesseractScreenshotConfiguration tesseractConfig = new()
{
    DataPath = "tessdata",
    Languages = new string[] { "eng", "deu" }
};
|
|
|
|
// Serializer settings shared by every WriteToFile call: indented output, and
// the relaxed encoder so non-ASCII characters (e.g. umlauts) and characters
// such as ' < > & are written literally instead of as \uXXXX escapes.
JsonSerializerOptions jsonOptions = new()
{
    WriteIndented = true,
    Encoder = System.Text.Encodings.Web.JavaScriptEncoder.UnsafeRelaxedJsonEscaping
};
|
|
|
|
// Entry point: expects exactly one command-line argument, the path of the
// image to scan. Previously a wrong argument count surfaced as an unhandled
// InvalidOperationException from args.Single(); report it explicitly instead.
if (args.Length != 1)
{
    Console.Error.WriteLine("Expected exactly one argument: the path of the image to scan.");
    Environment.ExitCode = 1;
    return;
}

var processor = MakeProcessor();

// MagickImage is IDisposable; keep it in a using declaration so it is released
// when the top-level statements finish (i.e. after Process has returned).
using var image = new MagickImage(args[0]);
processor.Process(new[] { image });

return;
|
|
|
|
// Assembles the full pipeline in three parts: an image-to-image preprocessing
// chain, the Tesseract scan, and a result-to-result postprocessing chain.
// After the scan and after each postprocessing step the current results are
// written to disk through WriteToFile under a step-specific name.
IProcessorChain<MagickImage, ScanResult> MakeProcessor()
{
    // Builds a ProcessingEvent whose callback hands the current results to
    // WriteToFile using the given stage name as the file-name stem.
    ProcessingEvent<ScanResult> Snapshot(string stage) =>
        new ProcessingEvent<ScanResult>((_, results) => WriteToFile(results, stage));

    // Image preparation steps, applied in the order listed.
    var imagePipeline = new ProcessorChainConfiguration<MagickImage, MagickImage>()
        .Use(new CloneImageProcessor())
        .Use(new ResizeProcessor(FilterType.Lanczos2Sharp, PixelInterpolateMethod.Mesh))
        .Use(new NormalizeProcessor())
        .Use(new ThresholdAdaptiveProcessor(15, 15))
        .Use(new AddBorderProcessor(10))
        .Use(new BinarizeProcessor())
        .Complete(new NegateCloneProcessor());

    // Result filtering steps, each followed by a snapshot of its output.
    var resultPipeline = new ProcessorChainConfiguration<ScanResult, ScanResult>()
        .Use(Snapshot("source"))
        .Use(new ConfidenceFilter(50))
        .Use(Snapshot("confidence"))
        .Use(new ToLowerProcessor())
        .Use(Snapshot("normalize"))
        .Use(new DuplicateFilter())
        .Use(Snapshot("duplicates"))
        .Use(new RegexFilter(wordRegex))
        .Complete(Snapshot("regex"));

    var ocr = new TesseractProcessor(tesseractConfig);

    return new ProcessorChainConfiguration<MagickImage, ScanResult>()
        .Use(imagePipeline)
        .Use(ocr)
        .Complete(resultPipeline);
}
|
|
|
|
// Dumps the given results as two JSON files in the current directory:
//   "<name>.detailed.json" — one WordInfo (text + confidence) per result
//   "<name>.json"          — the word texts only
// Existing files with those names are overwritten (FileMode.Create).
void WriteToFile(ICollection<ScanResult> data, string name)
{
    var detailedPath = $"{name}.detailed.json";
    var detailedItems = data.Select(result => WordInfo.Create(result));
    using (var detailedStream = File.Open(detailedPath, FileMode.Create))
    {
        JsonSerializer.Serialize(detailedStream, detailedItems, jsonOptions);
    }

    var wordsPath = $"{name}.json";
    var wordTexts = data.Select(result => result.Word.Text);
    using (var wordsStream = File.Open(wordsPath, FileMode.Create))
    {
        JsonSerializer.Serialize(wordsStream, wordTexts, jsonOptions);
    }
}
|
|
|
|
/// <summary>
/// Flat projection of a <c>ScanResult</c> holding only the word text and its
/// confidence; this is the shape written to the "*.detailed.json" files.
/// </summary>
struct WordInfo
{
    /// <summary>Recognized text, taken from <c>ScanResult.Word.Text</c>.</summary>
    public string Text { get; set; }

    /// <summary>
    /// Recognition confidence, taken from <c>ScanResult.Word.Confidence</c>.
    /// NOTE(review): scale is not visible here (the pipeline filters with a
    /// threshold of 50, so presumably 0–100) — confirm.
    /// </summary>
    public double Confidence { get; set; }

    /// <summary>Copies the text and confidence of <paramref name="result"/> into a new <see cref="WordInfo"/>.</summary>
    /// <param name="result">Scan result whose <c>Word</c> is projected.</param>
    /// <returns>A <see cref="WordInfo"/> populated from <paramref name="result"/>.</returns>
    public static WordInfo Create(ScanResult result)
    {
        return new WordInfo
        {
            Text = result.Word.Text,
            Confidence = result.Word.Confidence
        };
    }
}