43 lines
1.1 KiB
C#
43 lines
1.1 KiB
C#
using Ocr.Tesseract.Models;
|
|
using Process.Abstract;
|
|
using System.Collections.Generic;
|
|
using System.Linq;
|
|
|
|
namespace Ocr.Tesseract;
|
|
|
|
/// <summary>
/// Removes duplicate <see cref="Word"/>s from a sequence of scan results.
/// Results whose word text is equal are treated as duplicates of each other,
/// and exactly one result per distinct text is kept.
/// </summary>
public class DuplicateFilter : Processor<ScanResult, ScanResult>
{
    #region Overrides of Processor<ScanResult, ScanResult>

    /// <inheritdoc />
    public override IEnumerable<ScanResult> Process(IEnumerable<ScanResult> inputs)
    {
        // Bucket the results by their word text; each bucket holds all
        // instances that share the same text.
        var duplicateGroups = inputs.GroupBy(result => result.Word.Text);

        // Collapse every bucket to the single instance picked by the
        // overridable selector.
        var survivors = duplicateGroups.Select(duplicates => DuplicateSelector(duplicates));

        // Emit the kept instances from highest to lowest confidence.
        return survivors.OrderByDescending(survivor => survivor.Word.Confidence);
    }

    #endregion

    /// <summary>
    /// Picks the one instance to keep out of a group of results that share the same word text.
    /// Derived classes may override this to apply a different selection strategy.
    /// </summary>
    /// <param name="g">
    /// <see cref="IGrouping{TKey,TElement}"/> keyed by word text,
    /// containing the instances considered duplicates of each other
    /// </param>
    /// <returns>The single instance of the group that goes into the output data</returns>
    protected virtual ScanResult DuplicateSelector(IGrouping<string, ScanResult> g)
        // Default strategy: keep the instance with the highest confidence.
        // The null-forgiving operator suppresses the nullable warning on MaxBy's result.
        => g.MaxBy(candidate => candidate.Word.Confidence)!;
}
|