This repository has been archived on 2024-06-04. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
2023-08-10 09:04:36 +02:00

43 lines
1.1 KiB
C#

using Ocr.Tesseract.Models;
using Process.Abstract;
using System.Collections.Generic;
using System.Linq;
namespace Ocr.Tesseract;
/// <summary>
/// Collapses <see cref="ScanResult"/>s that share the same <see cref="Word"/> text
/// into a single entry per distinct text.
/// </summary>
/// <remarks>
/// Which entry survives for a given text is decided by
/// <see cref="DuplicateSelector"/>; derived classes may override it
/// to apply a different policy.
/// </remarks>
public class DuplicateFilter
    : Processor<ScanResult, ScanResult>
{
    #region Overrides of Processor<ScanResult, ScanResult>

    /// <inheritdoc />
    /// <remarks>
    /// Results are grouped by their word text, each group is reduced to one
    /// result via <see cref="DuplicateSelector"/>, and the remaining results
    /// are returned ordered from highest to lowest word confidence.
    /// </remarks>
    public override IEnumerable<ScanResult> Process(
        IEnumerable<ScanResult> inputs
    )
    {
        // Step 1: bucket all results that carry the same text
        var sameTextGroups = inputs.GroupBy(result => result.Word.Text);

        // Step 2: keep exactly one representative per bucket
        var representatives = sameTextGroups.Select(duplicates => DuplicateSelector(duplicates));

        // Step 3: most confident results come first
        return representatives.OrderByDescending(kept => kept.Word.Confidence);
    }

    #endregion

    /// <summary>
    /// Picks the one result that should be kept out of a group of results
    /// sharing the same word text.
    /// </summary>
    /// <param name="g">
    /// <see cref="IGrouping{TKey,TElement}"/> keyed by the word text,
    /// holding every result that has that text
    /// </param>
    /// <returns>The single result that represents the group in the output</returns>
    // Default policy: the result whose word has the highest confidence wins.
    protected virtual ScanResult DuplicateSelector(IGrouping<string, ScanResult> g) =>
        g.MaxBy(candidate => candidate.Word.Confidence)!;
}