// Listing metadata: 186 lines, 4.9 KiB, C#
namespace ReportGenerator;
|
|
|
|
/// <summary>
/// Holds one tagged word together with a scanned word assigned to it and the
/// error value recorded for that pairing.
/// </summary>
internal struct CharacterErrorInfo
{
    /// <summary>
    /// Creates an entry for <paramref name="taggedWord"/> with no scanned word assigned yet.
    /// </summary>
    /// <param name="taggedWord">The tagged word this entry describes.</param>
    public CharacterErrorInfo(string taggedWord)
    {
        TaggedWord = taggedWord;
        ScannedWord = null;
        // Positive infinity acts as the "nothing recorded yet" value: any finite error is smaller.
        CharacterError = double.PositiveInfinity;
    }

    /// <summary>The tagged word; set once by the constructor.</summary>
    public string TaggedWord { get; }

    /// <summary>The scanned word assigned to this entry, or <c>null</c> while none is assigned.</summary>
    public string? ScannedWord { get; set; }

    /// <summary>The error recorded for <see cref="ScannedWord"/>; positive infinity until one is set.</summary>
    public double CharacterError { get; set; }
}
|
|
|
|
/// <summary>
/// Error figures for one comparison: a single word-level error value plus a
/// collection of per-word <see cref="CharacterErrorInfo"/> entries.
/// </summary>
internal struct WordErrorInfo
{
    /// <summary>
    /// Starts with an empty <see cref="Words"/> list and <see cref="WordError"/> at positive infinity.
    /// Note that <c>default(WordErrorInfo)</c> does not run this constructor and leaves
    /// <see cref="Words"/> as <c>null</c>.
    /// </summary>
    public WordErrorInfo()
    {
        WordError = double.PositiveInfinity;
        Words = new List<CharacterErrorInfo>();
    }

    /// <summary>The word-level error value; positive infinity until assigned.</summary>
    public double WordError { get; set; }

    /// <summary>The per-word entries; the collection instance is fixed, its contents are mutable.</summary>
    public ICollection<CharacterErrorInfo> Words { get; }

    /// <summary>
    /// Arithmetic mean of <see cref="CharacterErrorInfo.CharacterError"/> over <see cref="Words"/>.
    /// </summary>
    /// <exception cref="InvalidOperationException"><see cref="Words"/> contains no entries.</exception>
    public double CharacterErrorAvg
    {
        get
        {
            return Words.Select(word => word.CharacterError).Average();
        }
    }
}
|
|
|
|
/// <summary>
/// An image name paired with a collection of <see cref="ScanTableRow"/> entries.
/// </summary>
/// <remarks>
/// NOTE(review): nothing in the visible code reads or writes this type; presumably it is
/// the per-image table of a generated report. Confirm against its consumer.
/// Both properties are <c>null</c> on a default-initialised instance.
/// </remarks>
internal struct ScanTable
{
    /// <summary>Name of the image the rows belong to.</summary>
    public string ImageName { get; set; }

    /// <summary>The rows of this table.</summary>
    public ICollection<ScanTableRow> Scans { get; set; }
}
|
|
|
|
/// <summary>
/// A scanner name paired with a collection of scanned words.
/// </summary>
/// <remarks>
/// NOTE(review): not referenced by the visible code; presumably one row per scanner in a
/// report table. Confirm against its consumer.
/// Both properties are <c>null</c> on a default-initialised instance.
/// </remarks>
internal struct ScanTableRow
{
    /// <summary>Name of the scanner this row belongs to.</summary>
    public string ScannerName { get; set; }

    /// <summary>The words stored for this scanner.</summary>
    public ICollection<string> ScannedWords { get; set; }
}
|
|
|
|
internal static class Program
|
|
{
|
|
/// <summary>
/// Entry point. Pairs every scan result with the tagged words of the same image, computes a
/// word error and per-word character errors for each pair, and ranks the processors by their
/// average character error and average word error.
/// </summary>
/// <param name="args">
/// <c>args[0]</c>: directory containing the tagged-data <c>*.json</c> files.
/// <c>args[1]</c>: directory containing the scan-result <c>*.json</c> files.
/// </param>
/// <exception cref="ArgumentException">
/// Fewer than two arguments were supplied, or one of the directories does not exist.
/// </exception>
internal static void Main(string[] args)
{
    // Fail with a usable message instead of an IndexOutOfRangeException from args[0]/args[1].
    if (args.Length < 2)
    {
        throw new ArgumentException("Expected two arguments: <tagged data directory> <scan results directory>");
    }

    var errorInfos = new List<(ScannedResultInfo scan, WordErrorInfo error)>();

    var tagFileInfos = GetTagFileInfos(args[0]);
    var scanFileInfos = GetScanFileInfos(args[1]).ToLookup(i => i.ImageName);

    Directory.CreateDirectory("reports");

    foreach (var tagFileInfo in tagFileInfos)
    {
        var taggedWords = tagFileInfo.GetWords();

        // Without expected words neither metric is defined: the word error divides by the
        // number of expected words, and averaging an empty set of character errors throws
        // when the rankings below are built. Skip such images instead of aborting the run.
        if (!taggedWords.Any())
        {
            continue;
        }

        foreach (var scanFileInfo in scanFileInfos[tagFileInfo.ImageName])
        {
            var scannedWords = scanFileInfo.GetWords();
            if (!scannedWords.Any())
            {
                continue;
            }

            // Calculate WER by comparing all tagged with all scanned words
            var wordErrorInfo = new WordErrorInfo
            {
                WordError = CalculateWer(taggedWords, scannedWords),
            };

            // Calculate CER for each tagged word
            foreach (var taggedWord in taggedWords)
            {
                wordErrorInfo.Words.Add(FindClosestScannedWord(taggedWord, scannedWords));
            }

            errorInfos.Add((scanFileInfo, wordErrorInfo));
        }
    }

    // NOTE(review): the two rankings below are computed but not consumed within this method.

    // Scans without any words are skipped above, so a processor that returned nothing for
    // some images is not penalised for them and its average can look better than it is.
    var bestCharErrorProcessor = errorInfos
        .GroupBy(e => e.scan.ProcessorName, e => e.error)
        .Select(g => (g.Key, g.Average(i => i.CharacterErrorAvg)))
        .OrderBy(g => g.Item2)
        .ToArray();

    // Same caveat here, but with less impact
    var bestWordErrorProcessor = errorInfos
        .GroupBy(e => e.scan.ProcessorName, e => e.error)
        .Select(g => (g.Key, g.Average(i => i.WordError)))
        .OrderBy(g => g.Item2)
        .ToArray();
}

/// <summary>
/// Finds the scanned word with the smallest character error relative to
/// <paramref name="taggedWord"/>; on ties the first one encountered wins.
/// </summary>
private static CharacterErrorInfo FindClosestScannedWord(string taggedWord, IEnumerable<string> scannedWords)
{
    var closest = new CharacterErrorInfo(taggedWord);

    foreach (var scannedWord in scannedWords)
    {
        var err = CalculateCer(taggedWord, scannedWord);

        if (err < closest.CharacterError)
        {
            closest.ScannedWord = scannedWord;
            closest.CharacterError = err;

            // The error is a whole-number edit distance, so comparing with 0 is exact;
            // nothing can beat a perfect match, stop searching.
            if (err == 0)
            {
                break;
            }
        }
    }

    return closest;
}
|
|
|
|
/// <summary>
/// Computes the Levenshtein (edit) distance between two words: the minimum number of
/// single-character insertions, deletions and substitutions that turn one into the other.
/// </summary>
/// <param name="s1">First word; <c>null</c> is treated as the empty string.</param>
/// <param name="s2">Second word; <c>null</c> is treated as the empty string.</param>
/// <returns>
/// The raw edit distance as a whole number (not normalised by word length). When one word
/// is empty this is the length of the other word.
/// </returns>
static double CalculateCer(string s1, string s2)
{
    s1 ??= string.Empty;
    s2 ??= string.Empty;

    // Turning an empty word into a non-empty one costs one edit per character. Returning 0
    // here would make an empty scanned word a "perfect match" for every tagged word.
    if (s1.Length == 0)
    {
        return s2.Length;
    }

    if (s2.Length == 0)
    {
        return s1.Length;
    }

    // Only the previous and the current row of the distance table are ever read.
    var previousRow = new int[s2.Length + 1];
    var currentRow = new int[s2.Length + 1];

    for (var j = 0; j <= s2.Length; j++)
    {
        previousRow[j] = j;
    }

    for (var i = 1; i <= s1.Length; i++)
    {
        currentRow[0] = i;

        for (var j = 1; j <= s2.Length; j++)
        {
            var substitutionCost = s1[i - 1] == s2[j - 1] ? 0 : 1;

            var deleteOrInsert = Math.Min(previousRow[j], currentRow[j - 1]) + 1;
            var substitute = previousRow[j - 1] + substitutionCost;
            currentRow[j] = Math.Min(deleteOrInsert, substitute);
        }

        (previousRow, currentRow) = (currentRow, previousRow);
    }

    // After the final swap the last computed row is in previousRow.
    return previousRow[s2.Length];
}
|
|
|
|
/// <summary>
/// Computes the word error rate of <paramref name="actual"/> against
/// <paramref name="expected"/>: the minimum number of word substitutions, deletions and
/// insertions needed to turn the expected sequence into the actual one, divided by the
/// number of expected words.
/// </summary>
/// <param name="expected">The reference words, in order.</param>
/// <param name="actual">The recognised words, in order.</param>
/// <returns>
/// <c>0</c> for identical sequences; may exceed <c>1</c> when many extra words were
/// inserted. With no expected words the result is <c>0</c> if <paramref name="actual"/> is
/// empty as well and positive infinity otherwise.
/// </returns>
/// <remarks>
/// Words are compared ordinally. Each error is counted once, taken from a single optimal
/// alignment; adding positional mismatches to set differences would count the same
/// missing word both as a substitution and as a deletion.
/// </remarks>
static double CalculateWer(ICollection<string> expected, ICollection<string> actual)
{
    string[] reference = expected.ToArray();
    string[] hypothesis = actual.ToArray();

    if (reference.Length == 0)
    {
        // The rate is undefined without a reference; avoid producing NaN from 0 / 0.
        return hypothesis.Length == 0 ? 0 : double.PositiveInfinity;
    }

    // Word-level edit distance, keeping only the previous and the current table row.
    var previousRow = new int[hypothesis.Length + 1];
    var currentRow = new int[hypothesis.Length + 1];

    for (var j = 0; j <= hypothesis.Length; j++)
    {
        previousRow[j] = j;
    }

    for (var i = 1; i <= reference.Length; i++)
    {
        currentRow[0] = i;

        for (var j = 1; j <= hypothesis.Length; j++)
        {
            var sameWord = string.Equals(reference[i - 1], hypothesis[j - 1], StringComparison.Ordinal);

            var substitution = previousRow[j - 1] + (sameWord ? 0 : 1);
            var deletion = previousRow[j] + 1;
            var insertion = currentRow[j - 1] + 1;

            currentRow[j] = Math.Min(substitution, Math.Min(deletion, insertion));
        }

        (previousRow, currentRow) = (currentRow, previousRow);
    }

    // After the final swap the last computed row is in previousRow.
    return previousRow[hypothesis.Length] / (double)reference.Length;
}
|
|
|
|
/// <summary>
/// Maps every <c>*.json</c> file directly inside <paramref name="dir"/> through
/// <c>TagFileInfo.FromPath</c>.
/// </summary>
/// <param name="dir">Directory holding the tagged data.</param>
/// <returns>
/// A lazily evaluated sequence: files are enumerated and converted only as the caller iterates.
/// </returns>
/// <exception cref="ArgumentException">
/// <paramref name="dir"/> does not exist; checked immediately when the method is called.
/// </exception>
private static IEnumerable<TagFileInfo> GetTagFileInfos(string dir)
{
    if (Directory.Exists(dir))
    {
        var jsonPaths = Directory.EnumerateFiles(dir, "*.json");
        return jsonPaths.Select(TagFileInfo.FromPath);
    }

    throw new ArgumentException($"Invalid tagged data directory '{dir}'");
}
|
|
|
|
/// <summary>
/// Maps every <c>*.json</c> file directly inside <paramref name="dir"/> through
/// <c>ScannedResultInfo.FromPath</c>.
/// </summary>
/// <param name="dir">Directory holding the scan results.</param>
/// <returns>
/// A lazily evaluated sequence: files are enumerated and converted only as the caller iterates.
/// </returns>
/// <exception cref="ArgumentException">
/// <paramref name="dir"/> does not exist; checked immediately when the method is called.
/// </exception>
private static IEnumerable<ScannedResultInfo> GetScanFileInfos(string dir)
{
    if (Directory.Exists(dir))
    {
        var jsonPaths = Directory.EnumerateFiles(dir, "*.json");
        return jsonPaths.Select(ScannedResultInfo.FromPath);
    }

    throw new ArgumentException($"Invalid scan results directory '{dir}'");
}
|
|
}
|