Adjusted namespaces and created a separate data directory
This commit is contained in:
@@ -2,44 +2,43 @@
|
||||
using System.Collections.Generic;
|
||||
using Tesseract;
|
||||
|
||||
namespace Ocr.Tesseract.Extensions
|
||||
namespace Ocr.Tesseract.Extensions;
|
||||
|
||||
/// <summary>
|
||||
/// Extensions for the <see cref="Page"/> type
|
||||
/// </summary>
|
||||
public static class PageExtensions
|
||||
{
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Extensions for the <see cref="Page"/> type
|
||||
/// Retrieves the <see cref="Word"/>s in the given <paramref name="page"/>
|
||||
/// </summary>
|
||||
public static class PageExtensions
|
||||
/// <param name="page">The <see cref="Page"/> to extract words from</param>
|
||||
/// <returns>The extracted <see cref="Word"/>s in the given <paramref name="page"/></returns>
|
||||
public static IEnumerable<Word> GetWords(this Page page)
|
||||
{
|
||||
using var iterator = page.GetIterator();
|
||||
iterator.Begin();
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Retrieves the <see cref="Word"/>s in the given <paramref name="page"/>
|
||||
/// </summary>
|
||||
/// <param name="page">The <see cref="Page"/> to extract words from</param>
|
||||
/// <returns>The extracted <see cref="Word"/>s in the given <paramref name="page"/></returns>
|
||||
public static IEnumerable<Word> GetWords(this Page page)
|
||||
do
|
||||
{
|
||||
using var iterator = page.GetIterator();
|
||||
iterator.Begin();
|
||||
|
||||
do
|
||||
{
|
||||
do
|
||||
{
|
||||
do
|
||||
{
|
||||
do
|
||||
var word = Word.Parse(iterator);
|
||||
if (string.IsNullOrEmpty(word.Text))
|
||||
{
|
||||
var word = Word.Parse(iterator);
|
||||
if (string.IsNullOrEmpty(word.Text))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
yield return word;
|
||||
} while (iterator.Next(PageIteratorLevel.TextLine, PageIteratorLevel.Word));
|
||||
} while (iterator.Next(PageIteratorLevel.Para, PageIteratorLevel.TextLine));
|
||||
} while (iterator.Next(PageIteratorLevel.Block, PageIteratorLevel.Para));
|
||||
} while (iterator.Next(PageIteratorLevel.Block));
|
||||
}
|
||||
yield return word;
|
||||
} while (iterator.Next(PageIteratorLevel.TextLine, PageIteratorLevel.Word));
|
||||
} while (iterator.Next(PageIteratorLevel.Para, PageIteratorLevel.TextLine));
|
||||
} while (iterator.Next(PageIteratorLevel.Block, PageIteratorLevel.Para));
|
||||
} while (iterator.Next(PageIteratorLevel.Block));
|
||||
}
|
||||
}
|
||||
@@ -7,61 +7,60 @@ using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using Tesseract;
|
||||
|
||||
namespace Ocr.Tesseract
|
||||
namespace Ocr.Tesseract;
|
||||
|
||||
/// <summary>
|
||||
/// Scans <see cref="MagickImage"/>s for <see cref="Word"/>s
|
||||
/// and maps the results to a <see cref="ScanResult"/>
|
||||
/// </summary>
|
||||
public class TesseractProcessor : Processor<MagickImage, ScanResult>
|
||||
{
|
||||
/// <summary>
|
||||
/// Scans <see cref="MagickImage"/>s for <see cref="Word"/>s
|
||||
/// and maps the results to a <see cref="ScanResult"/>
|
||||
/// </summary>
|
||||
public class TesseractProcessor : Processor<MagickImage, ScanResult>
|
||||
/// <inheritdoc cref="ITesseractConfiguration"/>
|
||||
public ITesseractConfiguration Configuration { get; }
|
||||
|
||||
/// <inheritdoc />
|
||||
public TesseractProcessor(ITesseractConfiguration config)
|
||||
{
|
||||
/// <inheritdoc cref="ITesseractConfiguration"/>
|
||||
public ITesseractConfiguration Configuration { get; }
|
||||
Configuration = config;
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public TesseractProcessor(ITesseractConfiguration config)
|
||||
{
|
||||
Configuration = config;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Scans the provided <paramref name="image"/> for <see cref="Word"/>s
|
||||
/// </summary>
|
||||
/// <param name="image">The <see cref="MagickImage"/> to scan</param>
|
||||
/// <returns>
|
||||
/// A list of <see cref="Word"/>s found
|
||||
/// in the provided <paramref name="image"/>
|
||||
/// </returns>
|
||||
private IEnumerable<Word> Scan(MagickImage image)
|
||||
{
|
||||
// Convert image
|
||||
using var pix = PixConverter.ToPix(image.ToBitmapWithDensity());
|
||||
using var engine = new TesseractEngine(
|
||||
Configuration.DataPath,
|
||||
string.Join('+', Configuration.Languages),
|
||||
EngineMode.Default,
|
||||
Enumerable.Empty<string>(),
|
||||
Configuration.Variables,
|
||||
false
|
||||
)
|
||||
{
|
||||
DefaultPageSegMode = PageSegMode.AutoOsd
|
||||
};
|
||||
|
||||
// Scan
|
||||
return engine
|
||||
.Process(pix)
|
||||
.GetWords()
|
||||
.ToArray();
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public override IEnumerable<ScanResult> Process(
|
||||
IEnumerable<MagickImage> inputs
|
||||
/// <summary>
|
||||
/// Scans the provided <paramref name="image"/> for <see cref="Word"/>s
|
||||
/// </summary>
|
||||
/// <param name="image">The <see cref="MagickImage"/> to scan</param>
|
||||
/// <returns>
|
||||
/// A list of <see cref="Word"/>s found
|
||||
/// in the provided <paramref name="image"/>
|
||||
/// </returns>
|
||||
private IEnumerable<Word> Scan(MagickImage image)
|
||||
{
|
||||
// Convert image
|
||||
using var pix = PixConverter.ToPix(image.ToBitmapWithDensity());
|
||||
using var engine = new TesseractEngine(
|
||||
Configuration.DataPath,
|
||||
string.Join('+', Configuration.Languages),
|
||||
EngineMode.Default,
|
||||
Enumerable.Empty<string>(),
|
||||
Configuration.Variables,
|
||||
false
|
||||
)
|
||||
{
|
||||
return inputs
|
||||
.SelectMany(Scan, (input, word) => new ScanResult(word, input));
|
||||
}
|
||||
DefaultPageSegMode = PageSegMode.AutoOsd
|
||||
};
|
||||
|
||||
// Scan
|
||||
return engine
|
||||
.Process(pix)
|
||||
.GetWords()
|
||||
.ToArray();
|
||||
}
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public override IEnumerable<ScanResult> Process(
|
||||
IEnumerable<MagickImage> inputs
|
||||
)
|
||||
{
|
||||
return inputs
|
||||
.SelectMany(Scan, (input, word) => new ScanResult(word, input));
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user