// Reviewer preamble. The patch text below is unchanged; these lines sit before the
// first "diff --git" header and are not part of any hunk.
//
// Purpose: moves the error-metric code out of Examples/ReportGenerator/Program.cs into
// new types in the ReportGenerator.Models namespace and prints one table per image.
//
// Models/CharacterStats.cs (new)
//   Pairs a reference word with a scanned value. CharacterError is the value returned by
//   CalculateCer, which fills a (s1.Length + 1) x (s2.Length + 1) edit-distance table and
//   returns its last cell, i.e. the raw Levenshtein distance (not divided by any length).
//   The one-argument constructor sets Value to string.Empty and CharacterError to
//   double.PositiveInfinity. ToString prints "`null`" in place of an empty Value.
//   NOTE(review): the removed Program.CalculateCer returned 0 when either string was null
//   or empty; the new copy has no such guard, so an empty value now scores s1.Length and a
//   null argument would fail on .Length. Confirm this change is intended.
//
// Models/ProcessorStat.cs (new)
//   GetCharacterStat starts each reference word at the PositiveInfinity placeholder and
//   replaces it with every candidate whose CharacterError is not greater than the current
//   one, stopping early when the error is 0.
//   NOTE(review): the removed loop used "err < best", so the first of several equally
//   good matches won; the new "> ... continue" form lets the last one win.
//   NOTE(review): CalculateWer is moved as-is (the author added a "todo this isn't correct"
//   remark); it adds Zip-based substitutions to Except-based deletions/insertions and
//   divides by expected.Count, which is 0 for an empty reference list.
//
// Models/ImageStats.cs (new)
//   Builds one ProcessorStat per scan result. ToTable yields, in order: a title row,
//   a "---" spacer row, one row per reference word, another spacer row, and three summary
//   rows labelled "CER (avg)", "CER (sum)" and "WER" formatted with "F2".
//   NOTE(review): the removed Main skipped scans whose word list was empty; the new code
//   does not, so such processors appear with PositiveInfinity placeholders.
//   NOTE(review): Average(...) is called on CharacterStats without an emptiness check;
//   verify behaviour when an image has no reference words.
//   NOTE(review): content rows use SelectMany over ToRow(reference), which returns every
//   stat whose Reference equals the word; repeated reference words would presumably add
//   extra cells per processor. Confirm against real tag files.
//
// Models/TableInfo.cs (new)
//   ToString concatenates Title, then for each row RowStart + (ColumnStart + cell +
//   ColumnEnd per cell) + RowEnd, using repeated string "+=".
//   NOTE(review): a StringBuilder would avoid the repeated reallocation.
//
// Models/ScannedResultInfo.cs (renamed) and Models/TagFileInfo.cs (added; old file deleted)
//   Namespace changes from ReportGenerator to ReportGenerator.Models; the TagFileInfo body
//   shown in the added and deleted hunks is the same.
//   NOTE(review): GetWords still throws a bare Exception for null words.
//
// Program.cs
//   Removes CharacterErrorInfo, WordErrorInfo, ScanTable, ScanTableRow, the static
//   CalculateCer/CalculateWer helpers and the per-processor ranking queries. Main now
//   calls Scan(...), wraps each ImageStats.ToTable() in a TableInfo with " | " delimiters
//   and writes it to the console.
//   NOTE(review): Directory.CreateDirectory("reports") is kept, but nothing visible in this
//   patch writes into that directory.
//
// ReportGenerator.csproj
//   Adds four lines after the existing "enable" line; their content is not visible here.
diff --git a/Examples/ReportGenerator/Models/CharacterStats.cs b/Examples/ReportGenerator/Models/CharacterStats.cs new file mode 100644 index 0000000..c2a10f4 --- /dev/null +++ b/Examples/ReportGenerator/Models/CharacterStats.cs @@ -0,0 +1,61 @@ +namespace ReportGenerator.Models; + +internal readonly struct CharacterStats +{ + public string Reference { get; } + + public string Value { get; } + + public double CharacterError { get; } + + public CharacterStats(string reference) + { + Reference = reference; + Value = string.Empty; + CharacterError = double.PositiveInfinity; + } + + public CharacterStats(string reference, string value) + { + Value = value; + Reference = reference; + + CharacterError = CalculateCer(reference, value); + } + + private static double CalculateCer(string s1, string s2) + { + var distance = new int[s1.Length + 1, s2.Length + 1]; + + for (var i = 0; i <= s1.Length; i++) + { + distance[i, 0] = i; + } + + for (var j = 0; j <= s2.Length; j++) + { + distance[0, j] = j; + } + + for (var i = 1; i <= s1.Length; i++) + { + for (var j = 1; j <= s2.Length; j++) + { + var cost = s2[j - 1] == s1[i - 1] ? 0 : 1; + + var c1 = Math.Min(distance[i - 1, j] + 1, distance[i, j - 1] + 1); + var c2 = distance[i - 1, j - 1] + cost; + distance[i, j] = Math.Min(c1, c2); + } + } + + return distance[s1.Length, s2.Length]; + } + + /// + public override string ToString() + { + var value = string.IsNullOrEmpty(Value) ? 
"`null`" : Value; + return $"{value} ({CharacterError})"; + } +} \ No newline at end of file diff --git a/Examples/ReportGenerator/Models/ImageStats.cs b/Examples/ReportGenerator/Models/ImageStats.cs new file mode 100644 index 0000000..2195731 --- /dev/null +++ b/Examples/ReportGenerator/Models/ImageStats.cs @@ -0,0 +1,52 @@ +namespace ReportGenerator.Models; + +internal readonly struct ImageStats +{ + public string ImageName { get; } = string.Empty; + + public ICollection Reference { get; } = Array.Empty(); + public ICollection Stats { get; } = Array.Empty(); + + public ImageStats( + string imageName, + ICollection taggedWords, + IEnumerable scanResult + ) + { + Reference = taggedWords; + ImageName = imageName; + Stats = scanResult + .Select(t => new ProcessorStat(t.ProcessorName, taggedWords, t.GetWords())) + .ToArray(); + } + + + public IEnumerable> ToTable() + { + // Title + yield return Stats.Select(s => s.ProcessorName).Prepend("Reference"); + + // Spacer + yield return Stats.Select(s => "---").Prepend("---"); + + // Content + foreach (var reference in Reference) + { + yield return Stats.SelectMany(s => s.ToRow(reference)).Prepend(reference); + } + + // Spacer + yield return Stats.Select(s => "---").Prepend("---"); + + // Summaries + yield return Stats + .Select(s => s.CharacterStats.Average(s => s.CharacterError).ToString("F2")) + .Prepend("CER (avg)"); + yield return Stats + .Select(s => s.CharacterStats.Sum(s => s.CharacterError).ToString("F2")) + .Prepend("CER (sum)"); + yield return Stats + .Select(s => s.WordError.ToString("F2")) + .Prepend("WER"); + } +} diff --git a/Examples/ReportGenerator/Models/ProcessorStat.cs b/Examples/ReportGenerator/Models/ProcessorStat.cs new file mode 100644 index 0000000..be81390 --- /dev/null +++ b/Examples/ReportGenerator/Models/ProcessorStat.cs @@ -0,0 +1,85 @@ +namespace ReportGenerator.Models; + +internal readonly struct ProcessorStat +{ + public string ProcessorName { get; } = string.Empty; + public ICollection 
CharacterStats { get; } = Array.Empty(); + public double WordError { get; } = double.PositiveInfinity; + + public ProcessorStat( + string processorName, + ICollection reference, + ICollection values + ) + { + ProcessorName = processorName; + + WordError = CalculateWer( + reference, + values + ); + + CharacterStats = GetCharacterStat( + reference, + values + ).ToArray(); + } + + public IEnumerable ToRow(string word) => CharacterStats + .Where(s => string.Equals(s.Reference, word)) + .Select(s => s.ToString()); + + /// + /// Finds the smallest possible CER by calculating the levenshtein + /// distance to every word and returning the most similar combination + /// + /// + private static IEnumerable GetCharacterStat( + IEnumerable reference, + ICollection values + ) + { + foreach (var refValue in reference) + { + CharacterStats result = new CharacterStats(refValue); + + foreach (var value in values) + { + var stat = new CharacterStats(refValue, value); + if (stat.CharacterError > result.CharacterError) + { + continue; + } + + result = stat; + + if (stat.CharacterError == 0) + { + break; + } + } + + yield return result; + } + } + + static double CalculateWer(ICollection expected, ICollection actual) + { + // Amount of words that need to be substituted to match the original + int substitutions = expected + .Zip( + actual, + (e, a) => string.Equals(e, a) ? 
0 : 1 + ) + .Sum(); + + // todo this isn't correct i think + // Amount of words dropped from the original + int deletions = expected.Except(actual).Count(); + + // Amount of extra words added compared to the original + int insertions = actual.Except(expected).Count(); + + return (substitutions + deletions + insertions) / (double)expected.Count; + } +} diff --git a/Examples/ReportGenerator/ScannedResultInfo.cs b/Examples/ReportGenerator/Models/ScannedResultInfo.cs similarity index 96% rename from Examples/ReportGenerator/ScannedResultInfo.cs rename to Examples/ReportGenerator/Models/ScannedResultInfo.cs index f83aa9b..63f7e28 100644 --- a/Examples/ReportGenerator/ScannedResultInfo.cs +++ b/Examples/ReportGenerator/Models/ScannedResultInfo.cs @@ -1,7 +1,7 @@ using System.Text.Json; using System.Text.RegularExpressions; -namespace ReportGenerator; +namespace ReportGenerator.Models; internal struct ScannedResultInfo { diff --git a/Examples/ReportGenerator/Models/TableInfo.cs b/Examples/ReportGenerator/Models/TableInfo.cs new file mode 100644 index 0000000..9758f81 --- /dev/null +++ b/Examples/ReportGenerator/Models/TableInfo.cs @@ -0,0 +1,49 @@ +namespace ReportGenerator.Models; + +internal readonly struct TableInfo +{ + public IEnumerable> Rows { get; } = Enumerable.Empty>(); + + public string Title { get; init; } = string.Empty; + + public string RowStart { get; init; } = string.Empty; + public string RowEnd { get; init; } = string.Empty; + + public string ColumnStart { get; init; } = string.Empty; + public string ColumnEnd { get; init; } = string.Empty; + + public TableInfo(IEnumerable> rows) + { + Rows = rows; + } + + #region Overrides of ValueType + + /// + public override string ToString() + { + string result = string.Empty; + + // Title + result += Title; + + // Body + foreach (var row in Rows) + { + result += RowStart; + + foreach (var column in row) + { + result += ColumnStart; + result += column; + result += ColumnEnd; + } + + result += RowEnd; + } + + return 
result; + } + + #endregion +} \ No newline at end of file diff --git a/Examples/ReportGenerator/Models/TagFileInfo.cs b/Examples/ReportGenerator/Models/TagFileInfo.cs new file mode 100644 index 0000000..390842f --- /dev/null +++ b/Examples/ReportGenerator/Models/TagFileInfo.cs @@ -0,0 +1,31 @@ +using System.Text.Json; + +namespace ReportGenerator.Models; + +internal struct TagFileInfo +{ + public string Path { get; private init; } + + public string ImageName { get; set; } + + public ICollection GetWords() + { + using var file = File.OpenRead(Path); + return JsonDocument + .Parse(file) + .RootElement + .GetProperty("words") + .EnumerateArray() + .Select(w => w.GetString() ?? throw new Exception("Cannot parse null words")) + .ToArray(); + } + + public static TagFileInfo FromPath(string path) => new() + { + Path = path, + ImageName = System.IO.Path.GetFileNameWithoutExtension(path), + }; + + /// + public override string ToString() => ImageName; +} diff --git a/Examples/ReportGenerator/Program.cs b/Examples/ReportGenerator/Program.cs index 1fd2ccb..1c9782b 100644 --- a/Examples/ReportGenerator/Program.cs +++ b/Examples/ReportGenerator/Program.cs @@ -1,167 +1,52 @@ -namespace ReportGenerator; +using ReportGenerator.Models; -internal struct CharacterErrorInfo -{ - public string TaggedWord { get; } - public string? 
ScannedWord { get; set; } = null; - public double CharacterError { get; set; } = double.PositiveInfinity; - - public CharacterErrorInfo(string taggedWord) => TaggedWord = taggedWord; -} - -internal struct WordErrorInfo -{ - public double WordError { get; set; } = double.PositiveInfinity; - - public ICollection Words { get; } = new List(); - - public double CharacterErrorAvg => Words.Average(i => i.CharacterError); - - public WordErrorInfo() - { - } -} - -internal struct ScanTable -{ - public string ImageName { get; set; } - - public ICollection Scans { get; set; } -} - -internal struct ScanTableRow -{ - public string ScannerName { get; set; } - - public ICollection ScannedWords { get; set; } -} +namespace ReportGenerator; internal static class Program { internal static void Main(string[] args) { - var errorInfos = new List<(ScannedResultInfo scan, WordErrorInfo error)>(); - var tagFileInfos = GetTagFileInfos(args[0]); - var scanFileInfos = GetScanFileInfos(args[1]).ToLookup(i => i.ImageName); + var scanFileInfos = GetScanFileInfos(args[1]); Directory.CreateDirectory("reports"); - foreach (var tagFileInfo in tagFileInfos) - { - var taggedWords = tagFileInfo.GetWords(); + var stats = Scan(tagFileInfos, scanFileInfos); - foreach (var scanFileInfo in scanFileInfos[tagFileInfo.ImageName]) + foreach (var stat in stats) + { + var tableFields = stat.ToTable(); + var tableInfo = new TableInfo(tableFields) { - var scannedWords = scanFileInfo.GetWords(); - if (!scannedWords.Any()) - { - continue; - } + Title = stat.ImageName + Environment.NewLine, + RowStart = " | ", + RowEnd = Environment.NewLine, + ColumnEnd = " | " + }; - // Calculate WER by comparing all tagged with all scanned words - var wordErrorInfo = new WordErrorInfo - { - WordError = CalculateWer(taggedWords, scannedWords), - }; + var tableStr = tableInfo.ToString(); - // Calculate CER for each tagged word - foreach (var taggedWord in taggedWords) - { - var characterErrorInfo = new CharacterErrorInfo(taggedWord); - 
- foreach (var scannedWord in scannedWords) - { - // Calculates the levenshtein distance to every word and returns the most similar combination - var err = CalculateCer(taggedWord, scannedWord); - - if (err < characterErrorInfo.CharacterError) - { - characterErrorInfo.ScannedWord = scannedWord; - characterErrorInfo.CharacterError = err; - - if (err == 0) - { - break; - } - } - } - - wordErrorInfo.Words.Add(characterErrorInfo); - } - - errorInfos.Add((scanFileInfo, wordErrorInfo)); - } + Console.WriteLine(); + Console.WriteLine(); + Console.WriteLine(tableStr); + Console.WriteLine(); + Console.WriteLine(); } - - // Somewhat off based on the amount of expected words - // If a processor did scan nothing at all this value can be very low - var bestCharErrorProcessor = errorInfos - .GroupBy(e => e.scan.ProcessorName, e => e.error) - .Select(g => (g.Key, g.Average(i => i.CharacterErrorAvg))) - .OrderBy(g => g.Item2) - .ToArray(); - - // Same here but with less impact - var bestWordErrorProcessor = errorInfos - .GroupBy(e => e.scan.ProcessorName, e => e.error) - .Select(g => (g.Key, g.Average(i => i.WordError))) - .OrderBy(g => g.Item2) - .ToArray(); } - static double CalculateCer(string s1, string s2) + private static IEnumerable Scan( + IEnumerable tagFileInfos, + IEnumerable scanFileInfos + ) { - if (string.IsNullOrEmpty(s1) || string.IsNullOrEmpty(s2)) - { - return 0; - } - - var distance = new int[s1.Length + 1, s2.Length + 1]; - - for (var i = 0; i <= s1.Length; i++) - { - distance[i, 0] = i; - } - - for (var j = 0; j <= s2.Length; j++) - { - distance[0, j] = j; - } - - for (var i = 1; i <= s1.Length; i++) - { - for (var j = 1; j <= s2.Length; j++) - { - var cost = s2[j - 1] == s1[i - 1] ? 
0 : 1; - - var c1 = Math.Min(distance[i - 1, j] + 1, distance[i, j - 1] + 1); - var c2 = distance[i - 1, j - 1] + cost; - distance[i, j] = Math.Min(c1, c2); - } - } - - return distance[s1.Length, s2.Length]; + var scanFileLookup = scanFileInfos.ToLookup(i => i.ImageName); + return tagFileInfos.Select(i => new ImageStats( + i.ImageName, + i.GetWords(), + scanFileLookup[i.ImageName] + )); } - static double CalculateWer(ICollection expected, ICollection actual) - { - // Amount of words that need to be substituted to match the original - int substitutions = expected - .Zip( - actual, - (e, a) => string.Equals(e, a) ? 0 : 1 - ) - .Sum(); - - // Amount of words dropped from the original - int deletions = expected.Except(actual).Count(); - - // Amount of extra words added compared to the original - int insertions = actual.Except(expected).Count(); - - return (substitutions + deletions + insertions) / (double)expected.Count; - } private static IEnumerable GetTagFileInfos(string dir) { diff --git a/Examples/ReportGenerator/ReportGenerator.csproj b/Examples/ReportGenerator/ReportGenerator.csproj index 74abf5c..8c735f6 100644 --- a/Examples/ReportGenerator/ReportGenerator.csproj +++ b/Examples/ReportGenerator/ReportGenerator.csproj @@ -7,4 +7,8 @@ enable + + + + diff --git a/Examples/ReportGenerator/TagFileInfo.cs b/Examples/ReportGenerator/TagFileInfo.cs deleted file mode 100644 index 14a1d1b..0000000 --- a/Examples/ReportGenerator/TagFileInfo.cs +++ /dev/null @@ -1,31 +0,0 @@ -using System.Text.Json; - -namespace ReportGenerator; - -internal struct TagFileInfo -{ - public string Path { get; private init; } - - public string ImageName { get; set; } - - public ICollection GetWords() - { - using var file = File.OpenRead(Path); - return JsonDocument - .Parse(file) - .RootElement - .GetProperty("words") - .EnumerateArray() - .Select(w => w.GetString() ?? 
throw new Exception("Cannot parse null words")) - .ToArray(); - } - - public static TagFileInfo FromPath(string path) => new() - { - Path = path, - ImageName = System.IO.Path.GetFileNameWithoutExtension(path), - }; - - /// - public override string ToString() => ImageName; -}