This repository has been archived on 2024-06-04. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
2024-01-11 00:16:41 +01:00

85 lines
2.9 KiB
C#

// See https://aka.ms/new-console-template for more information
using System.Text.Json;
using System.Text.RegularExpressions;
using ImageMagick;
using Ocr.Tesseract;
using Ocr.Tesseract.Models;
using Ocr.Tesseract.Screenshots;
using Ocr.Tesseract.Screenshots.Configuration;
using Ocr.Tesseract.Screenshots.Threshold;
using Process.Abstract.Configuration;
using Process.Interface;
// Pattern used by the final RegexFilter step: runs of at least two characters
// drawn from word characters, apostrophes and hyphens.
var wordRegex = new Regex(
    @"[\w'\-]{2,}",
    RegexOptions.Compiled | RegexOptions.Multiline | RegexOptions.IgnoreCase
);

// Configuration handed to the TesseractProcessor created in MakeProcessor.
var tesseractConfig = new TesseractScreenshotConfiguration
{
    DataPath = "tessdata",
    Languages = new[] { "eng", "deu" }
};

// Serializer settings shared by every JSON file written by WriteToFile.
var jsonOptions = new JsonSerializerOptions()
{
    WriteIndented = true,
    // NOTE(review): presumably chosen so that non-ASCII characters (e.g. German
    // umlauts) are written as-is instead of \uXXXX escapes - confirm.
    Encoder = System.Text.Encodings.Web.JavaScriptEncoder.UnsafeRelaxedJsonEscaping
};

// Exactly one argument (the image path) is required. Report a usage error
// instead of letting an unhandled exception from args.Single() crash the program.
if (args.Length != 1)
{
    Console.Error.WriteLine("Usage: <program> <image-path>");
    return 1;
}

var processor = MakeProcessor();

// MagickImage is disposable; release it once processing has finished.
using var inputImage = new MagickImage(args[0]);

// The return value is intentionally unused: results are written to disk by the
// WriteToFile events registered in MakeProcessor.
processor.Process(new[] { inputImage });
return 0;
// Assembles the complete pipeline: image preprocessing, the Tesseract scan, and
// result postprocessing. After each postprocessing stage the current results
// are dumped to disk through WriteToFile.
IProcessorChain<MagickImage, ScanResult> MakeProcessor()
{
    // Creates a pipeline step that writes the results seen at that point to the
    // files named after the given stage.
    ProcessingEvent<ScanResult> DumpStage(string stage)
    {
        return new ProcessingEvent<ScanResult>((_, data) => WriteToFile(data, stage));
    }

    // Image-to-image steps, applied in the order listed.
    var imageStage = new ProcessorChainConfiguration<MagickImage, MagickImage>()
        .Use(new CloneImageProcessor())
        .Use(new ResizeProcessor(FilterType.Lanczos2Sharp, PixelInterpolateMethod.Mesh))
        .Use(new NormalizeProcessor())
        .Use(new ThresholdAdaptiveProcessor(15, 15))
        .Use(new AddBorderProcessor(10))
        .Use(new BinarizeProcessor())
        .Complete(new NegateCloneProcessor());

    // Result-to-result steps; a dump follows the raw scan and every filter/transform.
    var resultStage = new ProcessorChainConfiguration<ScanResult, ScanResult>()
        .Use(DumpStage("source"))
        .Use(new ConfidenceFilter(50))
        .Use(DumpStage("confidence"))
        .Use(new ToLowerProcessor())
        .Use(DumpStage("normalize"))
        .Use(new DuplicateFilter())
        .Use(DumpStage("duplicates"))
        .Use(new RegexFilter(wordRegex))
        .Complete(DumpStage("regex"));

    // The OCR step that turns images into scan results.
    var ocrStage = new TesseractProcessor(tesseractConfig);

    return new ProcessorChainConfiguration<MagickImage, ScanResult>()
        .Use(imageStage)
        .Use(ocrStage)
        .Complete(resultStage);
}
// Writes two JSON files for the given results, overwriting existing ones:
//   "{name}.detailed.json" - one WordInfo (text and confidence) per result
//   "{name}.json"          - only the word texts
// Both files are serialized with the shared jsonOptions.
void WriteToFile(ICollection<ScanResult> data, string name)
{
    using var detailedStream = File.Open($"{name}.detailed.json", FileMode.Create);
    var detailedEntries = data.Select(entry => WordInfo.Create(entry));
    JsonSerializer.Serialize(detailedStream, detailedEntries, jsonOptions);

    using var wordsStream = File.Open($"{name}.json", FileMode.Create);
    var wordTexts = data.Select(entry => entry.Word.Text);
    JsonSerializer.Serialize(wordsStream, wordTexts, jsonOptions);
}
/// <summary>
/// Flat, serializable view of a scanned word: its text and its confidence.
/// </summary>
internal struct WordInfo
{
    /// <summary>The text of the word.</summary>
    public string Text { get; set; }

    /// <summary>The confidence value reported for the word.</summary>
    public double Confidence { get; set; }

    /// <summary>
    /// Builds a <see cref="WordInfo"/> from the word carried by <paramref name="result"/>.
    /// </summary>
    /// <param name="result">Scan result whose word text and confidence are copied.</param>
    /// <returns>A new <see cref="WordInfo"/> holding the copied values.</returns>
    public static WordInfo Create(ScanResult result)
    {
        var info = new WordInfo
        {
            Text = result.Word.Text,
            Confidence = result.Word.Confidence
        };
        return info;
    }
}