Skip to content
42 changes: 42 additions & 0 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
name: Test

on: [push, pull_request]

jobs:
test:
timeout-minutes: 10
runs-on: ${{ matrix.os }}
env:
PYTHONIOENCODING: "utf8"
strategy:
matrix:
python-version: [3.8, 3.9, 3.10.x, 3.11, 3.12]
os: [ubuntu-latest, windows-latest]

steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v1
with:
python-version: ${{ matrix.python-version }}
- name: Install system dependencies (Linux)
if: runner.os == 'Linux'
run: |
sudo apt update
sudo apt install tesseract-ocr poppler-utils imagemagick ghostscript
pip install -U ocrmypdf

- name: Install system dependencies (Windows)
if: runner.os == 'Windows'
run: |
choco install --yes --no-progress --pre tesseract
refreshenv
choco install --yes --no-progress --ignore-checksums ghostscript poppler imagemagick
pip install -U ocrmypdf

- name: Install testing dependencies
run: |
pip install -U wheel pip
pip install --editable ".[test]"
- name: Test with pytest
run: pytest
18 changes: 12 additions & 6 deletions src/invoice2data/input/tesseract.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
# -*- coding: utf-8 -*-
"""Tesseract OCR input module for invoice2data."""

import mimetypes
import os
import platform
import shutil
import tempfile
from logging import getLogger
Expand Down Expand Up @@ -40,17 +42,21 @@ def to_text(path: str, area_details: Optional[Dict[str, Any]] = None) -> str:
if not os.path.exists(path):
raise FileNotFoundError(f"File not found: {path}")
# Check for dependencies. Needs Tesseract and Imagemagick installed.
current_platform = platform.platform()
if current_platform.startswith("win32"):
convert_command_prefix = "magick"
else:
convert_command_prefix = "convert"
if not shutil.which("tesseract"):
raise OSError("tesseract not installed.")
if not shutil.which("convert"):
raise OSError("imagemagick not installed.")
raise EnvironmentError("tesseract not installed.")
if not shutil.which(convert_command_prefix):
raise EnvironmentError("imagemagick not installed.")

language = get_languages()
logger.debug("tesseract language arg is, %s", language)
timeout = 180

# convert the (multi-page) pdf file to a 300dpi png
convert = [
convert = [convert_command_prefix] + [
"convert",
"-units",
"PixelsPerInch",
Expand Down Expand Up @@ -188,4 +194,4 @@ def lang_error(output: str) -> str:
raise OSError(lang_error(output))
_header, *rest = output.splitlines()
langlist: Set[str] = {lang.strip() for lang in rest}
return "+".join(map(str, langlist))
return "+".join(map(str, langlist))
50 changes: 37 additions & 13 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,31 @@
# -*- coding: utf-8 -*-
# Run: python -m unittest tests.test_cli

# Or: python -m unittest discover

# 1. You define your own class derived from unittest.TestCase.
# 2. Then you fill it with functions that start with 'test_'
# 3. You run the tests by placing unittest.main() in your file,
# usually at the bottom.

# https://docs.python.org/3.10/library/unittest.html#test-cases

import csv
import datetime
import json
import os
import shutil
import sys
import unittest
from typing import Any
from typing import Dict
from xml.dom import minidom

from invoice2data.__main__ import main # Import main only
import pytest
from invoice2data.__main__ import main
from invoice2data.extract.loader import read_templates

from .common import exclude_template
from .common import get_sample_files
from .common import inputparser_specific
from .common import exclude_template, get_sample_files, inputparser_specific


def ocrmypdf_available() -> bool:
Expand All @@ -25,6 +37,10 @@ def ocrmypdf_available() -> bool:


needs_ocrmypdf = unittest.skipIf(not ocrmypdf_available(), reason="requires ocrmypdf")
skip_on_windows = pytest.mark.skipif(
sys.platform.startswith("win"),
reason="Tesseract executable cannot be found in Windows test environment. FIXME",
)


class TestCLI(unittest.TestCase):
Expand Down Expand Up @@ -80,10 +96,10 @@ def test_debug(self) -> None:
# TODO: move result comparison to own test module.
# TODO: parse output files instead of comparing them byte-by-byte.

def test_content_json(self) -> None:
@skip_on_windows
def test_content_json(self):
"""Tests the JSON output content."""
input_files = get_sample_files(".pdf")
input_files += get_sample_files(".txt")
input_files = get_sample_files((".pdf", ".txt"))
tests_templ_folder = "./tests/custom/templates"
json_files = get_sample_files(".json")
test_files = "test_compare.json"
Expand Down Expand Up @@ -111,7 +127,8 @@ def test_content_json(self) -> None:
)
os.remove(test_files)

def test_output_format_date_json(self) -> None:
@skip_on_windows
def test_output_format_date_json(self):
"""Tests the date format in JSON output."""
pdf_files = get_sample_files("free_fiber.pdf")
test_file = "test_compare.json"
Expand Down Expand Up @@ -168,7 +185,8 @@ def test_output_format_date_csv(self) -> None:
self.assertTrue(False, "Unexpected date format")
os.remove(test_file)

def test_output_format_date_xml(self) -> None:
@skip_on_windows
def test_output_format_date_xml(self):
"""Tests the date format in XML output."""
pdf_files = get_sample_files("free_fiber.pdf")
test_file = "test_compare.xml"
Expand All @@ -195,7 +213,8 @@ def test_output_format_date_xml(self) -> None:
self.assertTrue(False, "Unexpected date format")
os.remove(test_file)

def test_copy(self) -> None:
@skip_on_windows
def test_copy(self):
"""Tests the --copy argument."""
directory = os.path.dirname("tests/copy_test/pdf/")
# make sure directory is deleted
Expand Down Expand Up @@ -279,7 +298,8 @@ def get_filename_format_test_data(self, filename_format: str) -> Dict[str, Any]:
)
return data

def test_copy_with_default_filename_format(self) -> None:
@skip_on_windows
def test_copy_with_default_filename_format(self):
"""Tests the --copy argument with the default filename format."""
copy_dir = os.path.join("tests", "copy_test", "pdf")
# make sure directory is deleted
Expand Down Expand Up @@ -311,7 +331,8 @@ def test_copy_with_default_filename_format(self) -> None:

shutil.rmtree(os.path.dirname(copy_dir), ignore_errors=True)

def test_copy_with_custom_filename_format(self) -> None:
@skip_on_windows
def test_copy_with_custom_filename_format(self):
"""Tests the --copy argument with a custom filename format."""
copy_dir = os.path.join("tests", "copy_test", "pdf")
filename_format = "Custom Prefix {date} {invoice_number}.pdf"
Expand Down Expand Up @@ -343,7 +364,8 @@ def test_copy_with_custom_filename_format(self) -> None:

shutil.rmtree(os.path.dirname(copy_dir), ignore_errors=True)

def test_area(self) -> None:
@skip_on_windows
def test_area(self):
"""Tests the --area argument."""
pdf_files = get_sample_files("NetpresseInvoice.pdf")
test_file = "test_area.json"
Expand Down Expand Up @@ -372,6 +394,7 @@ def test_area(self) -> None:
# Where the pdf has to be ocr'd first
# before any keywords can be matched

@skip_on_windows
@needs_ocrmypdf
def test_ocrmypdf(self) -> None:
"""Tests the ocrmypdf input reader."""
Expand Down Expand Up @@ -403,6 +426,7 @@ def test_ocrmypdf(self) -> None:
# Test the fallback from pdf to text to ocrmypdf.
# with ocrmypdf installed

@skip_on_windows
@needs_ocrmypdf
def test_fallback_with_ocrmypdf(self) -> None:
"""Tests the fallback from pdftotext to ocrmypdf."""
Expand Down
16 changes: 10 additions & 6 deletions tests/test_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,9 @@

# https://docs.python.org/3.10/library/unittest.html#test-cases


import os
import sys
import pytest
import unittest
from io import StringIO # noqa: F401
from typing import Any
Expand Down Expand Up @@ -40,8 +41,10 @@ def have_pdfplumber() -> bool:
return True


needs_pdfplumber = unittest.skipIf(
not have_pdfplumber(), reason="requires pdfplumber\n"
needs_pdfplumber = unittest.skipIf(not have_pdfplumber(), reason="requires pdfplumber\n")
skip_on_windows = pytest.mark.skipif(
sys.platform.startswith("win"),
reason="Tesseract executable cannot be found in Windows test environment. FIXME",
)


Expand Down Expand Up @@ -124,8 +127,9 @@ def test_extract_data_pdfplumber(self) -> None:
print("Testing pdfplumber with file", file)
extract_data(file, [], pdfplumber)

def test_tesseract_for_return(self) -> None:
png_files = get_sample_files(".png")
@skip_on_windows
def test_tesseract_for_return(self):
png_files = get_sample_files('.png')
for file in png_files:
if tesseract.to_text(file) is None:
self.assertTrue(False, "Tesseract returned None")
Expand All @@ -146,4 +150,4 @@ def test_haveocrmypdf_available(self) -> None:


if __name__ == "__main__":
unittest.main()
unittest.main()
Loading