checkpoint
This commit is contained in:
@@ -0,0 +1,61 @@
|
||||
namespace ReportGenerator.Models;
|
||||
|
||||
/// <summary>
/// Pairs a reference word with a compared value and the character-level
/// edit distance between the two.
/// </summary>
internal readonly struct CharacterStats
{
    /// <summary>The reference word the value is compared against.</summary>
    public string Reference { get; }

    /// <summary>The compared word; empty when no value was supplied.</summary>
    public string Value { get; }

    /// <summary>
    /// Edit distance between <see cref="Reference"/> and <see cref="Value"/>;
    /// positive infinity when no value was supplied.
    /// </summary>
    public double CharacterError { get; }

    /// <summary>
    /// Creates an entry without a compared value: the value is empty and the
    /// error is positive infinity.
    /// </summary>
    /// <param name="reference">The reference word.</param>
    public CharacterStats(string reference)
    {
        CharacterError = double.PositiveInfinity;
        Value = string.Empty;
        Reference = reference;
    }

    /// <summary>
    /// Creates an entry for <paramref name="value"/> and computes its edit
    /// distance to <paramref name="reference"/>.
    /// </summary>
    /// <param name="reference">The reference word.</param>
    /// <param name="value">The word to compare with the reference.</param>
    public CharacterStats(string reference, string value)
    {
        Reference = reference;
        Value = value;
        CharacterError = CalculateCer(reference, value);
    }

    /// <summary>
    /// Computes the Levenshtein distance (unit cost for insertion, deletion
    /// and substitution) between two strings.
    /// </summary>
    /// <param name="s1">First string.</param>
    /// <param name="s2">Second string.</param>
    /// <returns>The absolute number of edits; it is not divided by any length.</returns>
    private static double CalculateCer(string s1, string s2)
    {
        // Only the previous and the current row of the distance table are
        // needed at any time, so two rolling rows replace the full matrix.
        var previousRow = new int[s2.Length + 1];
        var currentRow = new int[s2.Length + 1];

        for (var column = 0; column <= s2.Length; column++)
        {
            previousRow[column] = column;
        }

        for (var row = 1; row <= s1.Length; row++)
        {
            currentRow[0] = row;

            for (var column = 1; column <= s2.Length; column++)
            {
                var mismatch = s1[row - 1] == s2[column - 1] ? 0 : 1;

                var substitution = previousRow[column - 1] + mismatch;
                var deletion = previousRow[column] + 1;
                var insertion = currentRow[column - 1] + 1;

                currentRow[column] = Math.Min(substitution, Math.Min(deletion, insertion));
            }

            (previousRow, currentRow) = (currentRow, previousRow);
        }

        // After the final swap the last computed row is in previousRow.
        return previousRow[s2.Length];
    }

    /// <inheritdoc />
    public override string ToString()
    {
        string shown;

        if (string.IsNullOrEmpty(Value))
        {
            shown = "`null`";
        }
        else
        {
            shown = Value;
        }

        return $"{shown} ({CharacterError})";
    }
}
|
||||
@@ -0,0 +1,52 @@
|
||||
namespace ReportGenerator.Models;
|
||||
|
||||
/// <summary>
/// Statistics of one image: its reference (tagged) words and one
/// <see cref="ProcessorStat"/> per scan result supplied for it.
/// </summary>
internal readonly struct ImageStats
{
    /// <summary>Name of the image the statistics belong to.</summary>
    public string ImageName { get; } = string.Empty;

    /// <summary>The tagged words used as reference.</summary>
    public ICollection<string> Reference { get; } = Array.Empty<string>();

    /// <summary>One entry per scan result, in the order they were supplied.</summary>
    public ICollection<ProcessorStat> Stats { get; } = Array.Empty<ProcessorStat>();

    /// <summary>
    /// Builds the statistics of every scan result against the tagged words.
    /// </summary>
    /// <param name="imageName">Name of the image.</param>
    /// <param name="taggedWords">Reference words of the image.</param>
    /// <param name="scanResult">Scan results to evaluate against the reference.</param>
    public ImageStats(
        string imageName,
        ICollection<string> taggedWords,
        IEnumerable<ScannedResultInfo> scanResult
    )
    {
        Reference = taggedWords;
        ImageName = imageName;
        Stats = scanResult
            .Select(t => new ProcessorStat(t.ProcessorName, taggedWords, t.GetWords()))
            .ToArray();
    }

    /// <summary>
    /// Renders the statistics as table rows: a header, a spacer, one row per
    /// reference word, a second spacer and three summary rows
    /// (average CER, summed CER, WER).
    /// </summary>
    /// <returns>
    /// A lazily produced sequence of rows; every row has one leading label
    /// cell followed by exactly one cell per processor.
    /// </returns>
    public IEnumerable<IEnumerable<string>> ToTable()
    {
        // Title
        yield return Stats.Select(stat => stat.ProcessorName).Prepend("Reference");

        // Spacer
        yield return Stats.Select(_ => "---").Prepend("---");

        // Content
        foreach (var reference in Reference)
        {
            // A reference word that occurs more than once makes ToRow return
            // several (identical) cells; take only the first so that every
            // row keeps one cell per processor and the columns stay aligned.
            yield return Stats
                .Select(stat => stat.ToRow(reference).FirstOrDefault() ?? string.Empty)
                .Prepend(reference);
        }

        // Spacer
        yield return Stats.Select(_ => "---").Prepend("---");

        // Summaries
        yield return Stats
            .Select(stat => stat.CharacterStats
                .Select(character => character.CharacterError)
                // Average() throws on an empty sequence; report NaN when
                // there are no reference words instead of failing the report.
                .DefaultIfEmpty(double.NaN)
                .Average()
                .ToString("F2"))
            .Prepend("CER (avg)");
        yield return Stats
            .Select(stat => stat.CharacterStats
                .Sum(character => character.CharacterError)
                .ToString("F2"))
            .Prepend("CER (sum)");
        yield return Stats
            .Select(stat => stat.WordError.ToString("F2"))
            .Prepend("WER");
    }
}
|
||||
@@ -0,0 +1,85 @@
|
||||
namespace ReportGenerator.Models;
|
||||
|
||||
/// <summary>
/// Error figures of one processor for one image: the word error rate over
/// the whole word list and the closest scanned word for every reference word.
/// </summary>
internal readonly struct ProcessorStat
{
    /// <summary>Name of the processor the figures belong to.</summary>
    public string ProcessorName { get; } = string.Empty;

    /// <summary>One entry per reference word, holding its closest scanned word.</summary>
    public ICollection<CharacterStats> CharacterStats { get; } = Array.Empty<CharacterStats>();

    /// <summary>Word error rate of the scanned words against the reference.</summary>
    public double WordError { get; } = double.PositiveInfinity;

    /// <summary>
    /// Computes word and character error figures for a processor.
    /// </summary>
    /// <param name="processorName">Name of the processor.</param>
    /// <param name="reference">The reference words.</param>
    /// <param name="values">The words produced by the processor.</param>
    public ProcessorStat(
        string processorName,
        ICollection<string> reference,
        ICollection<string> values
    )
    {
        ProcessorName = processorName;

        WordError = CalculateWer(
            reference,
            values
        );

        CharacterStats = GetCharacterStat(
            reference,
            values
        ).ToArray();
    }

    /// <summary>
    /// Returns the textual form of every character stat whose reference
    /// equals <paramref name="word"/>.
    /// </summary>
    /// <param name="word">The reference word to look up.</param>
    /// <returns>One string per matching entry (several if the word is repeated in the reference).</returns>
    public IEnumerable<string> ToRow(string word) => CharacterStats
        .Where(s => string.Equals(s.Reference, word))
        .Select(s => s.ToString());

    /// <summary>
    /// Finds the smallest possible CER by calculating the levenshtein
    /// distance to every word and returning the most similar combination
    /// </summary>
    /// <param name="reference">The reference words.</param>
    /// <param name="values">The candidate words to match against each reference word.</param>
    /// <returns>
    /// One entry per reference word; when <paramref name="values"/> is empty
    /// the entry carries no value and an infinite error.
    /// </returns>
    private static IEnumerable<CharacterStats> GetCharacterStat(
        IEnumerable<string> reference,
        ICollection<string> values
    )
    {
        foreach (var refValue in reference)
        {
            CharacterStats result = new CharacterStats(refValue);

            foreach (var value in values)
            {
                var stat = new CharacterStats(refValue, value);
                if (stat.CharacterError > result.CharacterError)
                {
                    continue;
                }

                // On a tie the later candidate replaces the earlier one.
                result = stat;

                // An exact match cannot be beaten, stop searching.
                if (stat.CharacterError == 0)
                {
                    break;
                }
            }

            yield return result;
        }
    }

    /// <summary>
    /// Calculates the word error rate: the minimum number of word
    /// substitutions, deletions and insertions needed to turn
    /// <paramref name="actual"/> into <paramref name="expected"/>, divided
    /// by the number of expected words.
    /// </summary>
    /// <remarks>
    /// The three edit kinds are taken from a single minimal alignment
    /// (word-level Levenshtein distance). Counting positional mismatches and
    /// set differences independently counts the same wrong word several
    /// times, e.g. it reports 1.5 instead of 0.5 for ["a","b"] vs ["a","c"].
    /// </remarks>
    /// <param name="expected">The reference words, in order.</param>
    /// <param name="actual">The scanned words, in order.</param>
    /// <returns>
    /// The error rate (may exceed 1 when many words were inserted);
    /// 0 when both lists are empty, positive infinity when only the
    /// reference is empty.
    /// </returns>
    private static double CalculateWer(ICollection<string> expected, ICollection<string> actual)
    {
        var expectedWords = expected.ToArray();
        var actualWords = actual.ToArray();

        if (expectedWords.Length == 0)
        {
            // The rate is undefined without reference words; avoid 0/0 = NaN.
            return actualWords.Length == 0 ? 0d : double.PositiveInfinity;
        }

        // Rolling rows of the edit-distance table.
        var previousRow = new int[actualWords.Length + 1];
        var currentRow = new int[actualWords.Length + 1];

        for (var column = 0; column <= actualWords.Length; column++)
        {
            previousRow[column] = column;
        }

        for (var row = 1; row <= expectedWords.Length; row++)
        {
            currentRow[0] = row;

            for (var column = 1; column <= actualWords.Length; column++)
            {
                var mismatch = string.Equals(expectedWords[row - 1], actualWords[column - 1]) ? 0 : 1;

                var substitution = previousRow[column - 1] + mismatch;
                var deletion = previousRow[column] + 1;
                var insertion = currentRow[column - 1] + 1;

                currentRow[column] = Math.Min(substitution, Math.Min(deletion, insertion));
            }

            (previousRow, currentRow) = (currentRow, previousRow);
        }

        return previousRow[actualWords.Length] / (double)expectedWords.Length;
    }
}
|
||||
+1
-1
@@ -1,7 +1,7 @@
|
||||
using System.Text.Json;
|
||||
using System.Text.RegularExpressions;
|
||||
|
||||
namespace ReportGenerator;
|
||||
namespace ReportGenerator.Models;
|
||||
|
||||
internal struct ScannedResultInfo
|
||||
{
|
||||
@@ -0,0 +1,49 @@
|
||||
namespace ReportGenerator.Models;
|
||||
|
||||
/// <summary>
/// A table of string cells together with the delimiters used to render it
/// as text.
/// </summary>
internal readonly struct TableInfo
{
    /// <summary>The rows of the table; each row is a sequence of cell texts.</summary>
    public IEnumerable<IEnumerable<string>> Rows { get; } = Enumerable.Empty<IEnumerable<string>>();

    /// <summary>Text written once before the first row.</summary>
    public string Title { get; init; } = string.Empty;

    /// <summary>Text written before the first cell of every row.</summary>
    public string RowStart { get; init; } = string.Empty;

    /// <summary>Text written after the last cell of every row.</summary>
    public string RowEnd { get; init; } = string.Empty;

    /// <summary>Text written before every cell.</summary>
    public string ColumnStart { get; init; } = string.Empty;

    /// <summary>Text written after every cell.</summary>
    public string ColumnEnd { get; init; } = string.Empty;

    /// <summary>Creates a table over the given rows with empty delimiters.</summary>
    /// <param name="rows">The rows to render.</param>
    public TableInfo(IEnumerable<IEnumerable<string>> rows)
    {
        Rows = rows;
    }

    #region Overrides of ValueType

    /// <summary>
    /// Renders the title followed by every row, wrapping rows in
    /// <see cref="RowStart"/>/<see cref="RowEnd"/> and cells in
    /// <see cref="ColumnStart"/>/<see cref="ColumnEnd"/>.
    /// </summary>
    /// <returns>The rendered table.</returns>
    public override string ToString()
    {
        // A builder avoids re-copying the whole text on every appended cell.
        var builder = new System.Text.StringBuilder(Title);

        // default(TableInfo) skips the property initializers and leaves Rows
        // null; render it as a table without rows instead of throwing.
        foreach (var row in Rows ?? Enumerable.Empty<IEnumerable<string>>())
        {
            builder.Append(RowStart);

            foreach (var column in row)
            {
                builder.Append(ColumnStart).Append(column).Append(ColumnEnd);
            }

            builder.Append(RowEnd);
        }

        return builder.ToString();
    }

    #endregion
}
|
||||
@@ -0,0 +1,31 @@
|
||||
using System.Text.Json;
|
||||
|
||||
namespace ReportGenerator.Models;
|
||||
|
||||
/// <summary>
/// Describes a tag file: a JSON document whose root object has a
/// <c>words</c> array listing the words of one image.
/// </summary>
internal struct TagFileInfo
{
    /// <summary>Path of the tag file on disk.</summary>
    public string Path { get; private init; }

    /// <summary>Name of the image; the file name without extension when created through <see cref="FromPath"/>.</summary>
    public string ImageName { get; set; }

    /// <summary>
    /// Reads the tag file and returns the entries of its <c>words</c> array.
    /// </summary>
    /// <returns>The words in file order.</returns>
    /// <exception cref="InvalidDataException">A word in the array is JSON <c>null</c>.</exception>
    public ICollection<string> GetWords()
    {
        using var file = File.OpenRead(Path);

        // JsonDocument is disposable; the strings are copied out by ToArray()
        // before the document is released at the end of the method.
        using var document = JsonDocument.Parse(file);

        return document
            .RootElement
            .GetProperty("words")
            .EnumerateArray()
            .Select(w => w.GetString() ?? throw new InvalidDataException("Cannot parse null words"))
            .ToArray();
    }

    /// <summary>
    /// Creates the info for the tag file at <paramref name="path"/>, deriving
    /// the image name from the file name without its extension.
    /// </summary>
    /// <param name="path">Path of the tag file.</param>
    /// <returns>The populated info; the file is not opened here.</returns>
    public static TagFileInfo FromPath(string path) => new()
    {
        Path = path,
        ImageName = System.IO.Path.GetFileNameWithoutExtension(path),
    };

    /// <inheritdoc />
    public override string ToString() => ImageName;
}
|
||||
@@ -1,167 +1,52 @@
|
||||
namespace ReportGenerator;
|
||||
using ReportGenerator.Models;
|
||||
|
||||
internal struct CharacterErrorInfo
|
||||
{
|
||||
public string TaggedWord { get; }
|
||||
public string? ScannedWord { get; set; } = null;
|
||||
public double CharacterError { get; set; } = double.PositiveInfinity;
|
||||
|
||||
public CharacterErrorInfo(string taggedWord) => TaggedWord = taggedWord;
|
||||
}
|
||||
|
||||
internal struct WordErrorInfo
|
||||
{
|
||||
public double WordError { get; set; } = double.PositiveInfinity;
|
||||
|
||||
public ICollection<CharacterErrorInfo> Words { get; } = new List<CharacterErrorInfo>();
|
||||
|
||||
public double CharacterErrorAvg => Words.Average(i => i.CharacterError);
|
||||
|
||||
public WordErrorInfo()
|
||||
{
|
||||
}
|
||||
}
|
||||
|
||||
internal struct ScanTable
|
||||
{
|
||||
public string ImageName { get; set; }
|
||||
|
||||
public ICollection<ScanTableRow> Scans { get; set; }
|
||||
}
|
||||
|
||||
internal struct ScanTableRow
|
||||
{
|
||||
public string ScannerName { get; set; }
|
||||
|
||||
public ICollection<string> ScannedWords { get; set; }
|
||||
}
|
||||
namespace ReportGenerator;
|
||||
|
||||
internal static class Program
|
||||
{
|
||||
internal static void Main(string[] args)
|
||||
{
|
||||
var errorInfos = new List<(ScannedResultInfo scan, WordErrorInfo error)>();
|
||||
|
||||
var tagFileInfos = GetTagFileInfos(args[0]);
|
||||
var scanFileInfos = GetScanFileInfos(args[1]).ToLookup(i => i.ImageName);
|
||||
var scanFileInfos = GetScanFileInfos(args[1]);
|
||||
|
||||
Directory.CreateDirectory("reports");
|
||||
|
||||
foreach (var tagFileInfo in tagFileInfos)
|
||||
{
|
||||
var taggedWords = tagFileInfo.GetWords();
|
||||
var stats = Scan(tagFileInfos, scanFileInfos);
|
||||
|
||||
foreach (var scanFileInfo in scanFileInfos[tagFileInfo.ImageName])
|
||||
foreach (var stat in stats)
|
||||
{
|
||||
var tableFields = stat.ToTable();
|
||||
var tableInfo = new TableInfo(tableFields)
|
||||
{
|
||||
var scannedWords = scanFileInfo.GetWords();
|
||||
if (!scannedWords.Any())
|
||||
{
|
||||
continue;
|
||||
}
|
||||
Title = stat.ImageName + Environment.NewLine,
|
||||
RowStart = " | ",
|
||||
RowEnd = Environment.NewLine,
|
||||
ColumnEnd = " | "
|
||||
};
|
||||
|
||||
// Calculate WER by comparing all tagged with all scanned words
|
||||
var wordErrorInfo = new WordErrorInfo
|
||||
{
|
||||
WordError = CalculateWer(taggedWords, scannedWords),
|
||||
};
|
||||
var tableStr = tableInfo.ToString();
|
||||
|
||||
// Calculate CER for each tagged word
|
||||
foreach (var taggedWord in taggedWords)
|
||||
{
|
||||
var characterErrorInfo = new CharacterErrorInfo(taggedWord);
|
||||
|
||||
foreach (var scannedWord in scannedWords)
|
||||
{
|
||||
// Calculates the levenshtein distance to every word and returns the most similar combination
|
||||
var err = CalculateCer(taggedWord, scannedWord);
|
||||
|
||||
if (err < characterErrorInfo.CharacterError)
|
||||
{
|
||||
characterErrorInfo.ScannedWord = scannedWord;
|
||||
characterErrorInfo.CharacterError = err;
|
||||
|
||||
if (err == 0)
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
wordErrorInfo.Words.Add(characterErrorInfo);
|
||||
}
|
||||
|
||||
errorInfos.Add((scanFileInfo, wordErrorInfo));
|
||||
}
|
||||
Console.WriteLine();
|
||||
Console.WriteLine();
|
||||
Console.WriteLine(tableStr);
|
||||
Console.WriteLine();
|
||||
Console.WriteLine();
|
||||
}
|
||||
|
||||
// Somewhat off based on the amount of expected words
|
||||
// If a processor did scan nothing at all this value can be very low
|
||||
var bestCharErrorProcessor = errorInfos
|
||||
.GroupBy(e => e.scan.ProcessorName, e => e.error)
|
||||
.Select(g => (g.Key, g.Average(i => i.CharacterErrorAvg)))
|
||||
.OrderBy(g => g.Item2)
|
||||
.ToArray();
|
||||
|
||||
// Same here but with less impact
|
||||
var bestWordErrorProcessor = errorInfos
|
||||
.GroupBy(e => e.scan.ProcessorName, e => e.error)
|
||||
.Select(g => (g.Key, g.Average(i => i.WordError)))
|
||||
.OrderBy(g => g.Item2)
|
||||
.ToArray();
|
||||
}
|
||||
|
||||
static double CalculateCer(string s1, string s2)
|
||||
private static IEnumerable<ImageStats> Scan(
|
||||
IEnumerable<TagFileInfo> tagFileInfos,
|
||||
IEnumerable<ScannedResultInfo> scanFileInfos
|
||||
)
|
||||
{
|
||||
if (string.IsNullOrEmpty(s1) || string.IsNullOrEmpty(s2))
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
var distance = new int[s1.Length + 1, s2.Length + 1];
|
||||
|
||||
for (var i = 0; i <= s1.Length; i++)
|
||||
{
|
||||
distance[i, 0] = i;
|
||||
}
|
||||
|
||||
for (var j = 0; j <= s2.Length; j++)
|
||||
{
|
||||
distance[0, j] = j;
|
||||
}
|
||||
|
||||
for (var i = 1; i <= s1.Length; i++)
|
||||
{
|
||||
for (var j = 1; j <= s2.Length; j++)
|
||||
{
|
||||
var cost = s2[j - 1] == s1[i - 1] ? 0 : 1;
|
||||
|
||||
var c1 = Math.Min(distance[i - 1, j] + 1, distance[i, j - 1] + 1);
|
||||
var c2 = distance[i - 1, j - 1] + cost;
|
||||
distance[i, j] = Math.Min(c1, c2);
|
||||
}
|
||||
}
|
||||
|
||||
return distance[s1.Length, s2.Length];
|
||||
var scanFileLookup = scanFileInfos.ToLookup(i => i.ImageName);
|
||||
return tagFileInfos.Select(i => new ImageStats(
|
||||
i.ImageName,
|
||||
i.GetWords(),
|
||||
scanFileLookup[i.ImageName]
|
||||
));
|
||||
}
|
||||
|
||||
static double CalculateWer(ICollection<string> expected, ICollection<string> actual)
|
||||
{
|
||||
// Amount of words that need to be substituted to match the original
|
||||
int substitutions = expected
|
||||
.Zip(
|
||||
actual,
|
||||
(e, a) => string.Equals(e, a) ? 0 : 1
|
||||
)
|
||||
.Sum();
|
||||
|
||||
// Amount of words dropped from the original
|
||||
int deletions = expected.Except(actual).Count();
|
||||
|
||||
// Amount of extra words added compared to the original
|
||||
int insertions = actual.Except(expected).Count();
|
||||
|
||||
return (substitutions + deletions + insertions) / (double)expected.Count;
|
||||
}
|
||||
|
||||
private static IEnumerable<TagFileInfo> GetTagFileInfos(string dir)
|
||||
{
|
||||
|
||||
@@ -7,4 +7,8 @@
|
||||
<Nullable>enable</Nullable>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<PackageReference Include="Microsoft.EntityFrameworkCore" Version="7.0.5" />
|
||||
</ItemGroup>
|
||||
|
||||
</Project>
|
||||
|
||||
@@ -1,31 +0,0 @@
|
||||
using System.Text.Json;
|
||||
|
||||
namespace ReportGenerator;
|
||||
|
||||
internal struct TagFileInfo
|
||||
{
|
||||
public string Path { get; private init; }
|
||||
|
||||
public string ImageName { get; set; }
|
||||
|
||||
public ICollection<string> GetWords()
|
||||
{
|
||||
using var file = File.OpenRead(Path);
|
||||
return JsonDocument
|
||||
.Parse(file)
|
||||
.RootElement
|
||||
.GetProperty("words")
|
||||
.EnumerateArray()
|
||||
.Select(w => w.GetString() ?? throw new Exception("Cannot parse null words"))
|
||||
.ToArray();
|
||||
}
|
||||
|
||||
public static TagFileInfo FromPath(string path) => new()
|
||||
{
|
||||
Path = path,
|
||||
ImageName = System.IO.Path.GetFileNameWithoutExtension(path),
|
||||
};
|
||||
|
||||
/// <inheritdoc />
|
||||
public override string ToString() => ImageName;
|
||||
}
|
||||
Reference in New Issue
Block a user