adjusted namespaces and made separate data dir

This commit is contained in:
Simon Gruber
2023-11-22 06:51:08 +01:00
parent ea51f37f81
commit beb194c106
299 changed files with 720 additions and 647 deletions
+24 -25
View File
@@ -2,44 +2,43 @@
using System.Collections.Generic;
using Tesseract;
namespace Ocr.Tesseract.Extensions;

/// <summary>
/// Extensions for the <see cref="Page"/> type
/// </summary>
public static class PageExtensions
{
    /// <summary>
    /// Retrieves the <see cref="Word"/>s in the given <paramref name="page"/>
    /// </summary>
    /// <remarks>
    /// This is an iterator method: nothing is read from the <paramref name="page"/>
    /// until the returned sequence is enumerated, and the page iterator is disposed
    /// once enumeration finishes or is abandoned. Callers that dispose the
    /// <paramref name="page"/> (or its engine) must materialize the sequence first.
    /// </remarks>
    /// <param name="page">The <see cref="Page"/> to extract words from</param>
    /// <returns>
    /// The extracted <see cref="Word"/>s in the given <paramref name="page"/>,
    /// excluding those whose text is <c>null</c> or empty
    /// </returns>
    public static IEnumerable<Word> GetWords(this Page page)
    {
        using var iterator = page.GetIterator();
        iterator.Begin();

        // Walk the layout hierarchy block -> paragraph -> line -> word.
        // Each inner loop advances one level below its parent and ends when
        // Next(...) reports that the parent element has no further children.
        do
        {
            do
            {
                do
                {
                    do
                    {
                        var word = Word.Parse(iterator);

                        // Skip entries without any text.
                        if (!string.IsNullOrEmpty(word.Text))
                        {
                            yield return word;
                        }
                    } while (iterator.Next(PageIteratorLevel.TextLine, PageIteratorLevel.Word));
                } while (iterator.Next(PageIteratorLevel.Para, PageIteratorLevel.TextLine));
            } while (iterator.Next(PageIteratorLevel.Block, PageIteratorLevel.Para));
        } while (iterator.Next(PageIteratorLevel.Block));
    }
}
+51 -52
View File
@@ -7,61 +7,60 @@ using System.Collections.Generic;
using System.Linq;
using Tesseract;
namespace Ocr.Tesseract;

/// <summary>
/// Scans <see cref="MagickImage"/>s for <see cref="Word"/>s
/// and maps the results to a <see cref="ScanResult"/>
/// </summary>
public class TesseractProcessor : Processor<MagickImage, ScanResult>
{
    /// <inheritdoc cref="ITesseractConfiguration"/>
    public ITesseractConfiguration Configuration { get; }

    /// <summary>
    /// Creates a new <see cref="TesseractProcessor"/> using the given <paramref name="config"/>
    /// </summary>
    /// <param name="config">
    /// The configuration providing the data path, languages
    /// and variables passed to the <see cref="TesseractEngine"/>
    /// </param>
    public TesseractProcessor(ITesseractConfiguration config)
    {
        Configuration = config;
    }

    /// <summary>
    /// Scans the provided <paramref name="image"/> for <see cref="Word"/>s
    /// </summary>
    /// <remarks>
    /// A new <see cref="TesseractEngine"/> is created for every call.
    /// All intermediate resources (bitmap, pix, engine and page) are
    /// released before this method returns.
    /// </remarks>
    /// <param name="image">The <see cref="MagickImage"/> to scan</param>
    /// <returns>
    /// A list of <see cref="Word"/>s found
    /// in the provided <paramref name="image"/>
    /// </returns>
    private IEnumerable<Word> Scan(MagickImage image)
    {
        // Convert image; the intermediate bitmap is only needed to build
        // the pix, so it is disposed together with the other resources.
        using var bitmap = image.ToBitmapWithDensity();
        using var pix = PixConverter.ToPix(bitmap);

        using var engine = new TesseractEngine(
            Configuration.DataPath,
            string.Join('+', Configuration.Languages),
            EngineMode.Default,
            Enumerable.Empty<string>(),
            Configuration.Variables,
            false
        )
        {
            DefaultPageSegMode = PageSegMode.AutoOsd
        };

        // Scan; the page is disposed explicitly instead of being left to the GC.
        using var page = engine.Process(pix);

        // GetWords() is lazy, so materialize the words while
        // the page and the engine are still alive.
        return page
            .GetWords()
            .ToArray();
    }

    /// <inheritdoc />
    /// <remarks>
    /// The returned sequence is evaluated lazily: each input image
    /// is scanned when enumeration reaches it.
    /// </remarks>
    public override IEnumerable<ScanResult> Process(
        IEnumerable<MagickImage> inputs
    )
    {
        return inputs
            .SelectMany(Scan, (input, word) => new ScanResult(word, input));
    }
}