Improved postprocessing analysis

This commit is contained in:
Simon
2024-01-11 00:14:20 +01:00
parent 6eda25fce5
commit 41aa4bf151
2 changed files with 53 additions and 14 deletions
@@ -1,5 +1,6 @@
// See https://aka.ms/new-console-template for more information
using System.Text.Json;
using System.Text.RegularExpressions;
using ImageMagick;
using Ocr.Tesseract;
@@ -10,22 +11,25 @@ using Ocr.Tesseract.Screenshots.Threshold;
using Process.Abstract.Configuration;
using Process.Interface;
// Pattern for a "word": a run of two or more characters, each a word character,
// an apostrophe, or a hyphen.
// NOTE(review): the same regex and Tesseract configuration are declared again
// inside MakeProcessor — confirm which declaration is meant to remain.
var wordRegex = new Regex(
@"[\w'\-]{2,}",
RegexOptions.Compiled | RegexOptions.Multiline | RegexOptions.IgnoreCase
);
// Tesseract settings: data is read from "tessdata", using the "eng" and "deu" languages.
var tesseractConfig = new TesseractScreenshotConfiguration
{
DataPath = "tessdata",
Languages = new[] { "eng", "deu" }
};
// NOTE(review): looks like a leftover from the console template — confirm it is still wanted.
Console.WriteLine("Hello, World!");
// Serializer options used by WriteToFile; the JSON output is indented.
var jsonOptions = new JsonSerializerOptions() { WriteIndented = true };
// Build the processor chain and run it on one image, constructed from the only
// command-line argument. args.Single() throws unless exactly one argument is given.
var processor = MakeProcessor();
processor.Process(new[] { new MagickImage(args.Single()) });
// End of the top-level statements; only local functions and type declarations follow.
return;
IProcessorChain<MagickImage, ScanResult> MakeProcessor()
{
var wordRegex = new Regex(
@"[\w'\-]{2,}",
RegexOptions.Compiled | RegexOptions.Multiline | RegexOptions.IgnoreCase
);
var tesseractConfig = new TesseractScreenshotConfiguration
{
DataPath = "tessdata",
Languages = new[] { "eng", "deu" }
};
var preprocessing = new ProcessorChainConfiguration<MagickImage, MagickImage>()
.Use(new CloneImageProcessor())
.Use(new ResizeProcessor(FilterType.Lanczos2Sharp, PixelInterpolateMethod.Mesh))
@@ -36,11 +40,15 @@ IProcessorChain<MagickImage, ScanResult> MakeProcessor()
.Complete(new NegateCloneProcessor());
var postprocessing = new ProcessorChainConfiguration<ScanResult, ScanResult>()
.Use(new ProcessingEvent<ScanResult>((_, data) => WriteToFile(data, "source")))
.Use(new ConfidenceFilter(50))
// todo insert processing events and write to json files
.Use(new ProcessingEvent<ScanResult>((_, data) => WriteToFile(data, "confidence")))
.Use(new ToLowerProcessor())
.Use(new ProcessingEvent<ScanResult>((_, data) => WriteToFile(data, "normalize")))
.Use(new DuplicateFilter())
.Complete(new RegexFilter(wordRegex));
.Use(new ProcessingEvent<ScanResult>((_, data) => WriteToFile(data, "duplicates")))
.Use(new RegexFilter(wordRegex))
.Complete(new ProcessingEvent<ScanResult>((_, data) => WriteToFile(data, "regex")));
var scan = new TesseractProcessor(tesseractConfig);
@@ -49,3 +57,25 @@ IProcessorChain<MagickImage, ScanResult> MakeProcessor()
.Use(scan)
.Complete(postprocessing);
}
// Dumps one snapshot of scan results to disk as two JSON files:
//   "<name>.detailed.json" - one WordInfo entry (text and confidence) per result
//   "<name>.json"          - only the word texts
// Both files are opened with FileMode.Create, so existing files are replaced.
void WriteToFile(ICollection<ScanResult> data, string name)
{
    // Detailed dump first; the stream stays open until the method returns.
    using var detailedStream = File.Open($"{name}.detailed.json", FileMode.Create);
    var detailedEntries = data.Select(entry => WordInfo.Create(entry));
    JsonSerializer.Serialize(detailedStream, detailedEntries, jsonOptions);

    // Plain dump: just the text of every word.
    using var plainStream = File.Open($"{name}.json", FileMode.Create);
    var plainWords = data.Select(entry => entry.Word.Text);
    JsonSerializer.Serialize(plainStream, plainWords, jsonOptions);
}
// Flat shape used for the "*.detailed.json" dumps: a word's text together
// with its confidence value.
struct WordInfo
{
    // Text of the word, copied from ScanResult.Word.Text.
    public string Text { get; set; }

    // Confidence of the word, copied from ScanResult.Word.Confidence.
    public double Confidence { get; set; }

    // Builds a WordInfo from the word carried by a scan result.
    public static WordInfo Create(ScanResult result)
    {
        WordInfo info = default;
        info.Text = result.Word.Text;
        info.Confidence = result.Word.Confidence;
        return info;
    }
}
@@ -0,0 +1,9 @@
{
"profiles": {
"Refresh thesis results": {
"commandName": "Project",
"commandLineArgs": "source.png",
"workingDirectory": "C:\\Users\\Simon\\Documents\\Userdata\\FH\\SEM5\\BA\\bsc\\include\\postprocessing"
}
}
}