// Source file: thesis-src/Examples/ReportGenerator/Program.cs (C#, 186 lines, 4.9 KiB).
// Last commit: d8008e4f05 "a" by Simon Gruber, 2023-11-20 07:43:12 +01:00.
// The hosting repository was archived on 2024-06-04: files can be viewed and cloned,
// but issues, pull requests and pushes are disabled.

namespace ReportGenerator;
/// <summary>
/// Pairs one tagged word with the scanned word selected for it and the error value recorded for that pair.
/// </summary>
internal struct CharacterErrorInfo
{
    /// <summary>
    /// Creates an entry for <paramref name="taggedWord"/>. The remaining properties keep their
    /// initial values: no scanned word and an error of positive infinity.
    /// </summary>
    /// <param name="taggedWord">The tagged word this entry describes.</param>
    public CharacterErrorInfo(string taggedWord)
    {
        TaggedWord = taggedWord;
    }

    /// <summary>The tagged word supplied to the constructor.</summary>
    public string TaggedWord { get; }

    /// <summary>The scanned word assigned to this entry; <c>null</c> until a caller sets it.</summary>
    public string? ScannedWord { get; set; } = null;

    /// <summary>
    /// The error value for the pair. Initialised to <see cref="double.PositiveInfinity"/> so that any
    /// real measurement compares as smaller.
    /// </summary>
    public double CharacterError { get; set; } = double.PositiveInfinity;
}
/// <summary>
/// Error figures for one comparison: a single word-error value plus one
/// <see cref="CharacterErrorInfo"/> entry per word.
/// </summary>
internal struct WordErrorInfo
{
    /// <summary>
    /// The word error value. Initialised to <see cref="double.PositiveInfinity"/> until a caller assigns it.
    /// </summary>
    public double WordError { get; set; } = double.PositiveInfinity;

    /// <summary>The per-word entries; an empty list after construction with <c>new WordErrorInfo()</c>.</summary>
    public ICollection<CharacterErrorInfo> Words { get; } = new List<CharacterErrorInfo>();

    /// <summary>
    /// Arithmetic mean of <see cref="CharacterErrorInfo.CharacterError"/> over <see cref="Words"/>.
    /// Returns <see cref="double.PositiveInfinity"/> when there are no entries (including a
    /// <c>default</c> instance, whose <see cref="Words"/> is <c>null</c>), because
    /// <c>Enumerable.Average</c> throws on an empty sequence. Positive infinity is the same
    /// "nothing measured" value the other error properties start with.
    /// </summary>
    public double CharacterErrorAvg =>
        Words is { Count: > 0 }
            ? Words.Average(i => i.CharacterError)
            : double.PositiveInfinity;

    /// <summary>
    /// Explicit parameterless constructor; required so that the property initialisers above run
    /// for <c>new WordErrorInfo()</c>.
    /// </summary>
    public WordErrorInfo()
    {
    }
}
/// <summary>
/// Plain data holder that groups a collection of <see cref="ScanTableRow"/> entries under one image name.
/// It is not referenced anywhere else in the visible part of this file.
/// </summary>
internal struct ScanTable
{
/// <summary>Name of the image the rows belong to (presumably the same key as the <c>ImageName</c> used in <c>Main</c> - TODO confirm).</summary>
public string ImageName { get; set; }
/// <summary>The rows of the table. Not initialised here, so it is <c>null</c> until a caller assigns it.</summary>
public ICollection<ScanTableRow> Scans { get; set; }
}
/// <summary>
/// Plain data holder for one row of a <see cref="ScanTable"/>: a scanner name and a collection of words.
/// </summary>
internal struct ScanTableRow
{
/// <summary>Name of the scanner this row describes (presumably the processor that produced the words - TODO confirm).</summary>
public string ScannerName { get; set; }
/// <summary>The words stored for this row. Not initialised here, so it is <c>null</c> until a caller assigns it.</summary>
public ICollection<string> ScannedWords { get; set; }
}
internal static class Program
{
/// <summary>
/// Compares the words of every tagged file with the words of each scan result that has the same
/// <c>ImageName</c>, collects one <see cref="WordErrorInfo"/> per scan, and ranks the processors
/// by their average character error and average word error.
/// </summary>
/// <param name="args">
/// <c>args[0]</c> is the directory containing the tagged data files and <c>args[1]</c> the
/// directory containing the scan result files.
/// </param>
/// <exception cref="ArgumentException">
/// Fewer than two arguments were supplied, or one of the directories does not exist.
/// </exception>
internal static void Main(string[] args)
{
    // Fail with a usage hint instead of an IndexOutOfRangeException on args[0] / args[1].
    if (args is not { Length: >= 2 })
    {
        throw new ArgumentException(
            "Expected two arguments: <tagged data directory> <scan results directory>",
            nameof(args));
    }

    var errorInfos = new List<(ScannedResultInfo scan, WordErrorInfo error)>();
    var tagFileInfos = GetTagFileInfos(args[0]);
    var scanFileInfos = GetScanFileInfos(args[1]).ToLookup(i => i.ImageName);
    Directory.CreateDirectory("reports");

    foreach (var tagFileInfo in tagFileInfos)
    {
        ICollection<string> taggedWords = tagFileInfo.GetWords();
        if (taggedWords.Count == 0)
        {
            // An image without tagged words has nothing to measure against: there is no word
            // count to normalise the word error by and no per-word entries to average.
            continue;
        }

        foreach (var scanFileInfo in scanFileInfos[tagFileInfo.ImageName])
        {
            ICollection<string> scannedWords = scanFileInfo.GetWords();
            if (scannedWords.Count == 0)
            {
                continue;
            }

            // Word error: all tagged words compared with all scanned words.
            var wordErrorInfo = new WordErrorInfo
            {
                WordError = CalculateWer(taggedWords, scannedWords),
            };

            // Character error: one entry per tagged word, each paired with its closest scanned word.
            foreach (var taggedWord in taggedWords)
            {
                wordErrorInfo.Words.Add(FindClosestScannedWord(taggedWord, scannedWords));
            }

            errorInfos.Add((scanFileInfo, wordErrorInfo));
        }
    }

    // NOTE(review): both rankings below are computed but never printed or written, and nothing
    // is written into the "reports" directory created above - confirm what output is intended.

    // Scans without any words are skipped above, so a processor that produced nothing for some
    // images is not penalised for them and its average can look better than it deserves.
    var bestCharErrorProcessor = errorInfos
        .GroupBy(e => e.scan.ProcessorName, e => e.error)
        .Select(g => (g.Key, g.Average(i => i.CharacterErrorAvg)))
        .OrderBy(g => g.Item2)
        .ToArray();

    // The same caveat applies here, with less impact.
    var bestWordErrorProcessor = errorInfos
        .GroupBy(e => e.scan.ProcessorName, e => e.error)
        .Select(g => (g.Key, g.Average(i => i.WordError)))
        .OrderBy(g => g.Item2)
        .ToArray();
}

/// <summary>
/// Runs <c>CalculateCer</c> for <paramref name="taggedWord"/> against every scanned word and
/// returns the pairing with the smallest result.
/// </summary>
/// <param name="taggedWord">The tagged word to find a match for.</param>
/// <param name="scannedWords">The candidate scanned words.</param>
/// <returns>
/// An entry holding the tagged word, the closest scanned word and its error value. When
/// <paramref name="scannedWords"/> is empty the entry keeps its initial values.
/// </returns>
private static CharacterErrorInfo FindClosestScannedWord(string taggedWord, IEnumerable<string> scannedWords)
{
    var closest = new CharacterErrorInfo(taggedWord);
    foreach (var scannedWord in scannedWords)
    {
        var err = CalculateCer(taggedWord, scannedWord);
        if (err < closest.CharacterError)
        {
            closest.ScannedWord = scannedWord;
            closest.CharacterError = err;
            if (err == 0)
            {
                // Nothing can beat an error of zero, so stop searching.
                break;
            }
        }
    }

    return closest;
}
/// <summary>
/// Computes the Levenshtein edit distance between two strings: the minimum number of
/// single-character insertions, deletions and substitutions that turn one into the other.
/// Despite the name, the result is the raw distance and is not divided by a string length.
/// </summary>
/// <param name="s1">First string; <c>null</c> is treated as empty.</param>
/// <param name="s2">Second string; <c>null</c> is treated as empty.</param>
/// <returns>
/// The edit distance as a <see cref="double"/>. When one string is empty the distance is the
/// length of the other, so an empty string is never reported as a perfect match for a
/// non-empty one.
/// </returns>
static double CalculateCer(string s1, string s2)
{
    s1 ??= string.Empty;
    s2 ??= string.Empty;

    // Turning a string into the empty string (or back) costs one edit per character.
    if (s1.Length == 0)
    {
        return s2.Length;
    }
    if (s2.Length == 0)
    {
        return s1.Length;
    }

    // Only the previous and the current row of the distance matrix are ever read,
    // so two rows are enough.
    var previous = new int[s2.Length + 1];
    var current = new int[s2.Length + 1];
    for (var j = 0; j <= s2.Length; j++)
    {
        previous[j] = j;
    }

    for (var i = 1; i <= s1.Length; i++)
    {
        current[0] = i;
        for (var j = 1; j <= s2.Length; j++)
        {
            var cost = s1[i - 1] == s2[j - 1] ? 0 : 1;
            var insertOrDelete = Math.Min(previous[j] + 1, current[j - 1] + 1);
            var substitute = previous[j - 1] + cost;
            current[j] = Math.Min(insertOrDelete, substitute);
        }

        (previous, current) = (current, previous);
    }

    // After the final swap the last computed row is in 'previous'.
    return previous[s2.Length];
}
/// <summary>
/// Computes the word error rate (WER) of <paramref name="actual"/> against
/// <paramref name="expected"/>: the minimum number of word substitutions, deletions and
/// insertions needed to turn the expected sequence into the actual one, divided by the number
/// of expected words.
/// </summary>
/// <remarks>
/// The three edit kinds are counted together from one minimum-edit-distance alignment, so a
/// single wrong word counts once (not as a substitution plus a deletion plus an insertion), and
/// an inserted or dropped word does not mark every following word as wrong. Words are compared
/// ordinally and the comparison is sensitive to word order.
/// </remarks>
/// <param name="expected">The reference words.</param>
/// <param name="actual">The words to evaluate.</param>
/// <returns>
/// The error rate; it can exceed 1 when <paramref name="actual"/> has many extra words. With no
/// expected words the rate is 0 if <paramref name="actual"/> is empty too, otherwise
/// <see cref="double.PositiveInfinity"/>.
/// </returns>
static double CalculateWer(ICollection<string> expected, ICollection<string> actual)
{
    var reference = expected.ToArray();
    var hypothesis = actual.ToArray();

    if (reference.Length == 0)
    {
        // Avoid 0 / 0 (NaN): nothing expected and nothing produced is a perfect result.
        return hypothesis.Length == 0 ? 0 : double.PositiveInfinity;
    }

    // Word-level Levenshtein distance using two rolling rows.
    var previous = new int[hypothesis.Length + 1];
    var current = new int[hypothesis.Length + 1];
    for (var j = 0; j <= hypothesis.Length; j++)
    {
        previous[j] = j;
    }

    for (var i = 1; i <= reference.Length; i++)
    {
        current[0] = i;
        for (var j = 1; j <= hypothesis.Length; j++)
        {
            var sameWord = string.Equals(reference[i - 1], hypothesis[j - 1], StringComparison.Ordinal);
            var substitution = previous[j - 1] + (sameWord ? 0 : 1);
            var deletion = previous[j] + 1;
            var insertion = current[j - 1] + 1;
            current[j] = Math.Min(substitution, Math.Min(deletion, insertion));
        }

        (previous, current) = (current, previous);
    }

    // After the final swap the last computed row is in 'previous'.
    return previous[hypothesis.Length] / (double)reference.Length;
}
/// <summary>
/// Maps every <c>*.json</c> file found in <paramref name="dir"/> through
/// <c>TagFileInfo.FromPath</c>. The files are enumerated lazily, as the result is iterated.
/// </summary>
/// <param name="dir">Directory containing the tagged data files.</param>
/// <returns>One <c>TagFileInfo</c> per matching file.</returns>
/// <exception cref="ArgumentException"><paramref name="dir"/> is not an existing directory.</exception>
private static IEnumerable<TagFileInfo> GetTagFileInfos(string dir)
{
    if (Directory.Exists(dir))
    {
        var jsonPaths = Directory.EnumerateFiles(dir, "*.json");
        return jsonPaths.Select(TagFileInfo.FromPath);
    }

    throw new ArgumentException($"Invalid tagged data directory '{dir}'");
}
/// <summary>
/// Maps every <c>*.json</c> file found in <paramref name="dir"/> through
/// <c>ScannedResultInfo.FromPath</c>. The files are enumerated lazily, as the result is iterated.
/// </summary>
/// <param name="dir">Directory containing the scan result files.</param>
/// <returns>One <c>ScannedResultInfo</c> per matching file.</returns>
/// <exception cref="ArgumentException"><paramref name="dir"/> is not an existing directory.</exception>
private static IEnumerable<ScannedResultInfo> GetScanFileInfos(string dir)
{
    return Directory.Exists(dir)
        ? Directory.EnumerateFiles(dir, "*.json").Select(ScannedResultInfo.FromPath)
        : throw new ArgumentException($"Invalid scan results directory '{dir}'");
}
}