checkpoint

This commit is contained in:
Simon Gruber
2023-11-20 14:26:00 +01:00
parent d8008e4f05
commit 9ac01e6b12
9 changed files with 312 additions and 176 deletions
@@ -0,0 +1,61 @@
namespace ReportGenerator.Models;
/// <summary>
/// Pairs a reference word with the value it was compared against and the
/// Levenshtein (edit) distance between the two.
/// </summary>
internal readonly struct CharacterStats
{
    /// <summary>The reference word.</summary>
    public string Reference { get; }

    /// <summary>The compared value; an empty string when none was supplied.</summary>
    public string Value { get; }

    /// <summary>
    /// Edit distance between <see cref="Reference"/> and <see cref="Value"/>,
    /// or <see cref="double.PositiveInfinity"/> when no value was supplied.
    /// </summary>
    public double CharacterError { get; }

    /// <summary>
    /// Creates an entry for a reference word that has no compared value yet.
    /// </summary>
    /// <param name="reference">The reference word.</param>
    public CharacterStats(string reference)
    {
        (Reference, Value, CharacterError) = (reference, string.Empty, double.PositiveInfinity);
    }

    /// <summary>
    /// Creates an entry and computes the edit distance between both words.
    /// </summary>
    /// <param name="reference">The reference word.</param>
    /// <param name="value">The word compared against the reference.</param>
    public CharacterStats(string reference, string value)
    {
        Reference = reference;
        Value = value;
        CharacterError = CalculateCer(reference, value);
    }

    /// <summary>
    /// Computes the Levenshtein distance between two strings, i.e. the number
    /// of single-character insertions, deletions and substitutions needed to
    /// turn one into the other.
    /// </summary>
    /// <param name="s1">First string.</param>
    /// <param name="s2">Second string.</param>
    /// <returns>The edit distance as a whole number stored in a double.</returns>
    private static double CalculateCer(string s1, string s2)
    {
        // Only the previous and the current row of the distance table are
        // needed at any time, so two rolling rows are sufficient.
        var previousRow = new int[s2.Length + 1];
        var currentRow = new int[s2.Length + 1];

        for (var col = 0; col < previousRow.Length; col++)
        {
            previousRow[col] = col;
        }

        for (var row = 1; row <= s1.Length; row++)
        {
            currentRow[0] = row;

            for (var col = 1; col <= s2.Length; col++)
            {
                var substitution = previousRow[col - 1] + (s1[row - 1] == s2[col - 1] ? 0 : 1);
                var deletion = previousRow[col] + 1;
                var insertion = currentRow[col - 1] + 1;

                currentRow[col] = Math.Min(Math.Min(deletion, insertion), substitution);
            }

            (previousRow, currentRow) = (currentRow, previousRow);
        }

        // After the final swap the most recently filled row is previousRow.
        return previousRow[s2.Length];
    }

    /// <inheritdoc />
    public override string ToString()
    {
        string shown;
        if (string.IsNullOrEmpty(Value))
        {
            shown = "`null`";
        }
        else
        {
            shown = Value;
        }

        return $"{shown} ({CharacterError})";
    }
}
@@ -0,0 +1,52 @@
namespace ReportGenerator.Models;
/// <summary>
/// Statistics for a single image: the reference words and one
/// <see cref="ProcessorStat"/> per scan result, renderable as a table.
/// </summary>
internal readonly struct ImageStats
{
    /// <summary>Name of the image these statistics belong to.</summary>
    public string ImageName { get; } = string.Empty;

    /// <summary>The reference words of the image.</summary>
    public ICollection<string> Reference { get; } = Array.Empty<string>();

    /// <summary>One entry per scan result, built against <see cref="Reference"/>.</summary>
    public ICollection<ProcessorStat> Stats { get; } = Array.Empty<ProcessorStat>();

    /// <summary>
    /// Builds the per-processor statistics for an image.
    /// </summary>
    /// <param name="imageName">Name of the image.</param>
    /// <param name="taggedWords">Reference words of the image.</param>
    /// <param name="scanResult">Scan results to compare against the reference words.</param>
    public ImageStats(
        string imageName,
        ICollection<string> taggedWords,
        IEnumerable<ScannedResultInfo> scanResult
    )
    {
        Reference = taggedWords;
        ImageName = imageName;
        Stats = scanResult
            .Select(t => new ProcessorStat(t.ProcessorName, taggedWords, t.GetWords()))
            .ToArray();
    }

    /// <summary>
    /// Produces the table rows: a title row, a spacer, one row per reference
    /// word, another spacer and three summary rows (average CER, summed CER
    /// and WER). Every row has exactly one leading label cell plus one cell
    /// per processor.
    /// </summary>
    /// <returns>The rows of the table, each row being a sequence of cells.</returns>
    public IEnumerable<IEnumerable<string>> ToTable()
    {
        // Copy to locals so the lambdas below never have to touch 'this'.
        var stats = Stats;
        var references = Reference;

        // Title
        yield return stats.Select(stat => stat.ProcessorName).Prepend("Reference");

        // Spacer
        yield return stats.Select(_ => "---").Prepend("---");

        // Content
        foreach (var reference in references)
        {
            // ToRow returns one entry per occurrence of the word in the
            // reference list. Taking only the first keeps exactly one cell per
            // processor, so repeated reference words no longer misalign the
            // table.
            yield return stats
                .Select(stat => stat.ToRow(reference).FirstOrDefault() ?? string.Empty)
                .Prepend(reference);
        }

        // Spacer
        yield return stats.Select(_ => "---").Prepend("---");

        // Summaries
        yield return stats
            .Select(stat => stat.CharacterStats
                .Select(character => character.CharacterError)
                // Average() throws on an empty sequence; report NaN instead
                // when there are no reference words.
                .DefaultIfEmpty(double.NaN)
                .Average()
                .ToString("F2"))
            .Prepend("CER (avg)");

        yield return stats
            .Select(stat => stat.CharacterStats.Sum(character => character.CharacterError).ToString("F2"))
            .Prepend("CER (sum)");

        yield return stats
            .Select(stat => stat.WordError.ToString("F2"))
            .Prepend("WER");
    }
}
@@ -0,0 +1,85 @@
namespace ReportGenerator.Models;
/// <summary>
/// Error statistics of a single processor for one image: a word error rate
/// over the whole word list and a best-match character error per reference
/// word.
/// </summary>
internal readonly struct ProcessorStat
{
    /// <summary>Name of the processor that produced the scanned words.</summary>
    public string ProcessorName { get; } = string.Empty;

    /// <summary>One entry per reference word, in reference order.</summary>
    public ICollection<CharacterStats> CharacterStats { get; } = Array.Empty<CharacterStats>();

    /// <summary>Word error rate of the scanned words against the reference words.</summary>
    public double WordError { get; } = double.PositiveInfinity;

    /// <summary>
    /// Computes the word and character statistics for one processor.
    /// </summary>
    /// <param name="processorName">Name of the processor.</param>
    /// <param name="reference">The reference (expected) words.</param>
    /// <param name="values">The words produced by the processor.</param>
    public ProcessorStat(
        string processorName,
        ICollection<string> reference,
        ICollection<string> values
    )
    {
        ProcessorName = processorName;
        WordError = CalculateWer(reference, values);
        CharacterStats = GetCharacterStat(reference, values).ToArray();
    }

    /// <summary>
    /// Returns the rendered character statistics of every entry whose
    /// reference equals <paramref name="word"/> (one item per occurrence).
    /// </summary>
    /// <param name="word">The reference word to look up.</param>
    public IEnumerable<string> ToRow(string word) => CharacterStats
        .Where(s => string.Equals(s.Reference, word))
        .Select(s => s.ToString());

    /// <summary>
    /// For every reference word, compares it with every scanned value and
    /// yields the comparison with the smallest edit distance. A reference word
    /// yields an entry with an infinite error when there are no values.
    /// </summary>
    /// <param name="reference">The reference words.</param>
    /// <param name="values">The scanned words to match against.</param>
    /// <returns>One <see cref="Models.CharacterStats"/> per reference word.</returns>
    private static IEnumerable<CharacterStats> GetCharacterStat(
        IEnumerable<string> reference,
        ICollection<string> values
    )
    {
        foreach (var refValue in reference)
        {
            CharacterStats result = new CharacterStats(refValue);

            foreach (var value in values)
            {
                var stat = new CharacterStats(refValue, value);
                if (stat.CharacterError > result.CharacterError)
                {
                    continue;
                }

                result = stat;

                // The distance is a whole number, so an exact comparison is
                // safe; nothing can beat a perfect match.
                if (stat.CharacterError == 0)
                {
                    break;
                }
            }

            yield return result;
        }
    }

    /// <summary>
    /// Calculates the word error rate: the minimum number of word
    /// substitutions, deletions and insertions needed to turn
    /// <paramref name="actual"/> into <paramref name="expected"/> (word-level
    /// Levenshtein distance), divided by the number of expected words.
    /// </summary>
    /// <remarks>
    /// The previous implementation added positional mismatches to set
    /// differences, which counted the same error several times and ignored
    /// duplicate words.
    /// </remarks>
    /// <param name="expected">The reference words.</param>
    /// <param name="actual">The scanned words.</param>
    /// <returns>
    /// The word error rate; <c>0</c> when both lists are empty and
    /// <see cref="double.PositiveInfinity"/> when words were scanned although
    /// none were expected.
    /// </returns>
    private static double CalculateWer(ICollection<string> expected, ICollection<string> actual)
    {
        var expectedWords = expected.ToArray();
        var actualWords = actual.ToArray();

        if (expectedWords.Length == 0)
        {
            // Avoid 0 / 0 (NaN): nothing expected and nothing scanned is a perfect result.
            return actualWords.Length == 0 ? 0 : double.PositiveInfinity;
        }

        // Two rolling rows of the edit distance table.
        var previousRow = new int[actualWords.Length + 1];
        var currentRow = new int[actualWords.Length + 1];

        for (var col = 0; col < previousRow.Length; col++)
        {
            previousRow[col] = col;
        }

        for (var row = 1; row <= expectedWords.Length; row++)
        {
            currentRow[0] = row;

            for (var col = 1; col <= actualWords.Length; col++)
            {
                var cost = string.Equals(expectedWords[row - 1], actualWords[col - 1]) ? 0 : 1;
                var substitution = previousRow[col - 1] + cost;
                var deletion = previousRow[col] + 1;
                var insertion = currentRow[col - 1] + 1;

                currentRow[col] = Math.Min(Math.Min(deletion, insertion), substitution);
            }

            (previousRow, currentRow) = (currentRow, previousRow);
        }

        return previousRow[actualWords.Length] / (double)expectedWords.Length;
    }
}
@@ -1,7 +1,7 @@
using System.Text.Json;
using System.Text.RegularExpressions;
namespace ReportGenerator;
namespace ReportGenerator.Models;
internal struct ScannedResultInfo
{
@@ -0,0 +1,49 @@
namespace ReportGenerator.Models;
/// <summary>
/// Describes a textual table: its rows plus the strings emitted around the
/// title, each row and each column when rendering with <see cref="ToString"/>.
/// </summary>
internal readonly struct TableInfo
{
    /// <summary>The rows of the table; each row is a sequence of cell texts.</summary>
    public IEnumerable<IEnumerable<string>> Rows { get; } = Enumerable.Empty<IEnumerable<string>>();

    /// <summary>Text emitted once before the first row.</summary>
    public string Title { get; init; } = string.Empty;

    /// <summary>Text emitted before the cells of every row.</summary>
    public string RowStart { get; init; } = string.Empty;

    /// <summary>Text emitted after the cells of every row.</summary>
    public string RowEnd { get; init; } = string.Empty;

    /// <summary>Text emitted before every cell.</summary>
    public string ColumnStart { get; init; } = string.Empty;

    /// <summary>Text emitted after every cell.</summary>
    public string ColumnEnd { get; init; } = string.Empty;

    /// <summary>
    /// Creates a table over the given rows; the decoration strings default to empty.
    /// </summary>
    /// <param name="rows">The rows of the table.</param>
    public TableInfo(IEnumerable<IEnumerable<string>> rows)
    {
        Rows = rows;
    }

    #region Overrides of ValueType

    /// <summary>
    /// Renders the title followed by every row, wrapping rows in
    /// <see cref="RowStart"/>/<see cref="RowEnd"/> and cells in
    /// <see cref="ColumnStart"/>/<see cref="ColumnEnd"/>.
    /// </summary>
    /// <returns>The rendered table text.</returns>
    public override string ToString()
    {
        // A StringBuilder avoids re-copying the whole text on every append,
        // which the previous string concatenation did for each cell.
        var builder = new System.Text.StringBuilder();

        // Title
        builder.Append(Title);

        // A default(TableInfo) has no rows at all; render only the title then.
        if (Rows is null)
        {
            return builder.ToString();
        }

        // Body
        foreach (var row in Rows)
        {
            builder.Append(RowStart);

            foreach (var column in row)
            {
                builder.Append(ColumnStart).Append(column).Append(ColumnEnd);
            }

            builder.Append(RowEnd);
        }

        return builder.ToString();
    }

    #endregion
}
@@ -0,0 +1,31 @@
using System.Text.Json;
namespace ReportGenerator.Models;
/// <summary>
/// Points at a JSON tag file holding the reference words of one image.
/// </summary>
internal struct TagFileInfo
{
    /// <summary>Path of the tag file on disk.</summary>
    public string Path { get; private init; }

    /// <summary>Image name; <see cref="FromPath"/> sets it to the file name without extension.</summary>
    public string ImageName { get; set; }

    /// <summary>
    /// Reads the tag file and returns the strings of its top-level
    /// <c>words</c> array.
    /// </summary>
    /// <returns>The words in file order.</returns>
    /// <exception cref="JsonException">
    /// The file is not valid JSON or the array contains a <c>null</c> entry.
    /// </exception>
    public ICollection<string> GetWords()
    {
        using var file = File.OpenRead(Path);

        // JsonDocument rents pooled buffers and must be disposed. The words
        // are materialised by ToArray() before the document goes out of scope.
        using var document = JsonDocument.Parse(file);

        return document
            .RootElement
            .GetProperty("words")
            .EnumerateArray()
            .Select(w => w.GetString() ?? throw new JsonException("Cannot parse null words"))
            .ToArray();
    }

    /// <summary>
    /// Creates an instance for the given file, deriving the image name from
    /// the file name without its extension.
    /// </summary>
    /// <param name="path">Path of the tag file.</param>
    public static TagFileInfo FromPath(string path) => new()
    {
        Path = path,
        ImageName = System.IO.Path.GetFileNameWithoutExtension(path),
    };

    /// <inheritdoc />
    public override string ToString() => ImageName;
}
+29 -144
View File
@@ -1,167 +1,52 @@
namespace ReportGenerator;
using ReportGenerator.Models;
/// <summary>
/// Character error of one tagged word together with the scanned word it was
/// matched with.
/// </summary>
internal struct CharacterErrorInfo
{
    /// <summary>
    /// Creates an entry without a match: no scanned word and an infinite error.
    /// </summary>
    /// <param name="taggedWord">The tagged word.</param>
    public CharacterErrorInfo(string taggedWord)
    {
        TaggedWord = taggedWord;
        ScannedWord = null;
        CharacterError = double.PositiveInfinity;
    }

    /// <summary>The tagged word.</summary>
    public string TaggedWord { get; }

    /// <summary>The matched scanned word, or <c>null</c> when none has been set.</summary>
    public string? ScannedWord { get; set; }

    /// <summary>Character error of the match; infinite until one has been set.</summary>
    public double CharacterError { get; set; }
}
/// <summary>
/// Word error of one scan together with the character errors of its words.
/// </summary>
internal struct WordErrorInfo
{
    /// <summary>
    /// Creates an instance with an infinite word error and an empty, mutable
    /// word list.
    /// </summary>
    public WordErrorInfo()
    {
        WordError = double.PositiveInfinity;
        Words = new List<CharacterErrorInfo>();
    }

    /// <summary>Word error of the scan; infinite until set.</summary>
    public double WordError { get; set; }

    /// <summary>Character error entries collected for the scan.</summary>
    public ICollection<CharacterErrorInfo> Words { get; }

    /// <summary>Average of <see cref="CharacterErrorInfo.CharacterError"/> over <see cref="Words"/>.</summary>
    public double CharacterErrorAvg
    {
        get
        {
            return Words.Average(entry => entry.CharacterError);
        }
    }
}
/// <summary>
/// Holds an image name together with a collection of <see cref="ScanTableRow"/> entries.
/// </summary>
internal struct ScanTable
{
/// <summary>Name of the image (presumably the image the scans belong to; confirm against callers).</summary>
public string ImageName { get; set; }
/// <summary>The scan rows stored for this table.</summary>
public ICollection<ScanTableRow> Scans { get; set; }
}
/// <summary>
/// Holds a scanner name together with a collection of scanned words.
/// </summary>
internal struct ScanTableRow
{
/// <summary>Name of the scanner (presumably the one that produced <see cref="ScannedWords"/>; confirm against callers).</summary>
public string ScannerName { get; set; }
/// <summary>The scanned words stored for this row.</summary>
public ICollection<string> ScannedWords { get; set; }
}
namespace ReportGenerator;
internal static class Program
{
internal static void Main(string[] args)
{
var errorInfos = new List<(ScannedResultInfo scan, WordErrorInfo error)>();
var tagFileInfos = GetTagFileInfos(args[0]);
var scanFileInfos = GetScanFileInfos(args[1]).ToLookup(i => i.ImageName);
var scanFileInfos = GetScanFileInfos(args[1]);
Directory.CreateDirectory("reports");
foreach (var tagFileInfo in tagFileInfos)
{
var taggedWords = tagFileInfo.GetWords();
var stats = Scan(tagFileInfos, scanFileInfos);
foreach (var scanFileInfo in scanFileInfos[tagFileInfo.ImageName])
foreach (var stat in stats)
{
var tableFields = stat.ToTable();
var tableInfo = new TableInfo(tableFields)
{
var scannedWords = scanFileInfo.GetWords();
if (!scannedWords.Any())
{
continue;
}
Title = stat.ImageName + Environment.NewLine,
RowStart = " | ",
RowEnd = Environment.NewLine,
ColumnEnd = " | "
};
// Calculate WER by comparing all tagged with all scanned words
var wordErrorInfo = new WordErrorInfo
{
WordError = CalculateWer(taggedWords, scannedWords),
};
var tableStr = tableInfo.ToString();
// Calculate CER for each tagged word
foreach (var taggedWord in taggedWords)
{
var characterErrorInfo = new CharacterErrorInfo(taggedWord);
foreach (var scannedWord in scannedWords)
{
// Calculates the levenshtein distance to every word and returns the most similar combination
var err = CalculateCer(taggedWord, scannedWord);
if (err < characterErrorInfo.CharacterError)
{
characterErrorInfo.ScannedWord = scannedWord;
characterErrorInfo.CharacterError = err;
if (err == 0)
{
break;
}
}
}
wordErrorInfo.Words.Add(characterErrorInfo);
}
errorInfos.Add((scanFileInfo, wordErrorInfo));
}
Console.WriteLine();
Console.WriteLine();
Console.WriteLine(tableStr);
Console.WriteLine();
Console.WriteLine();
}
// Somewhat off based on the amount of expected words
// If a processor did scan nothing at all this value can be very low
var bestCharErrorProcessor = errorInfos
.GroupBy(e => e.scan.ProcessorName, e => e.error)
.Select(g => (g.Key, g.Average(i => i.CharacterErrorAvg)))
.OrderBy(g => g.Item2)
.ToArray();
// Same here but with less impact
var bestWordErrorProcessor = errorInfos
.GroupBy(e => e.scan.ProcessorName, e => e.error)
.Select(g => (g.Key, g.Average(i => i.WordError)))
.OrderBy(g => g.Item2)
.ToArray();
}
static double CalculateCer(string s1, string s2)
private static IEnumerable<ImageStats> Scan(
IEnumerable<TagFileInfo> tagFileInfos,
IEnumerable<ScannedResultInfo> scanFileInfos
)
{
if (string.IsNullOrEmpty(s1) || string.IsNullOrEmpty(s2))
{
return 0;
}
var distance = new int[s1.Length + 1, s2.Length + 1];
for (var i = 0; i <= s1.Length; i++)
{
distance[i, 0] = i;
}
for (var j = 0; j <= s2.Length; j++)
{
distance[0, j] = j;
}
for (var i = 1; i <= s1.Length; i++)
{
for (var j = 1; j <= s2.Length; j++)
{
var cost = s2[j - 1] == s1[i - 1] ? 0 : 1;
var c1 = Math.Min(distance[i - 1, j] + 1, distance[i, j - 1] + 1);
var c2 = distance[i - 1, j - 1] + cost;
distance[i, j] = Math.Min(c1, c2);
}
}
return distance[s1.Length, s2.Length];
var scanFileLookup = scanFileInfos.ToLookup(i => i.ImageName);
return tagFileInfos.Select(i => new ImageStats(
i.ImageName,
i.GetWords(),
scanFileLookup[i.ImageName]
));
}
static double CalculateWer(ICollection<string> expected, ICollection<string> actual)
{
// Amount of words that need to be substituted to match the original
int substitutions = expected
.Zip(
actual,
(e, a) => string.Equals(e, a) ? 0 : 1
)
.Sum();
// Amount of words dropped from the original
int deletions = expected.Except(actual).Count();
// Amount of extra words added compared to the original
int insertions = actual.Except(expected).Count();
return (substitutions + deletions + insertions) / (double)expected.Count;
}
private static IEnumerable<TagFileInfo> GetTagFileInfos(string dir)
{
@@ -7,4 +7,8 @@
<Nullable>enable</Nullable>
</PropertyGroup>
<ItemGroup>
<PackageReference Include="Microsoft.EntityFrameworkCore" Version="7.0.5" />
</ItemGroup>
</Project>
-31
View File
@@ -1,31 +0,0 @@
using System.Text.Json;
namespace ReportGenerator;
/// <summary>
/// Points at a JSON tag file holding the reference words of one image.
/// </summary>
internal struct TagFileInfo
{
    /// <summary>Path of the tag file on disk.</summary>
    public string Path { get; private init; }

    /// <summary>Image name; <see cref="FromPath"/> sets it to the file name without extension.</summary>
    public string ImageName { get; set; }

    /// <summary>
    /// Reads the tag file and returns the strings of its top-level
    /// <c>words</c> array.
    /// </summary>
    /// <returns>The words in file order.</returns>
    /// <exception cref="JsonException">
    /// The file is not valid JSON or the array contains a <c>null</c> entry.
    /// </exception>
    public ICollection<string> GetWords()
    {
        using var file = File.OpenRead(Path);

        // JsonDocument rents pooled buffers and must be disposed. The words
        // are materialised by ToArray() before the document goes out of scope.
        using var document = JsonDocument.Parse(file);

        return document
            .RootElement
            .GetProperty("words")
            .EnumerateArray()
            .Select(w => w.GetString() ?? throw new JsonException("Cannot parse null words"))
            .ToArray();
    }

    /// <summary>
    /// Creates an instance for the given file, deriving the image name from
    /// the file name without its extension.
    /// </summary>
    /// <param name="path">Path of the tag file.</param>
    public static TagFileInfo FromPath(string path) => new()
    {
        Path = path,
        ImageName = System.IO.Path.GetFileNameWithoutExtension(path),
    };

    /// <inheritdoc />
    public override string ToString() => ImageName;
}