Add Python-based tests for PDFs and rework CI

CI config now dry as a desert, with everything based around the (also improved) Makefile(s)
parent e5a83f0e
Pipeline #92003 passed with stages
in 6 minutes and 55 seconds
# ======================================================================================
# Python (
# ======================================================================================
# Byte-compiled / optimized / DLL files
# C extensions
# Distribution / packaging
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
# Installer logs
# Unit test / coverage reports
# Translations
# Django stuff:
# Flask stuff:
# Scrapy stuff:
# Sphinx documentation
# PyBuilder
# Jupyter Notebook
# IPython
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
# PEP 582; used by e.g.
# Celery stuff
# SageMath parsed files
# Environments
# Spyder project settings
# Rope project settings
# mkdocs documentation
# mypy
# Pyre type checker
# pytype static type analyzer
# Cython debug symbols
# ======================================================================================
# LaTeX
# ======================================================================================
# Generated PDF is not suitable for Git. We get our compiled PDF via CI.
# Only ignore root level PDF, since images/vectors might be in PDF format.
# Applies to all jobs, can be overridden in each
# ======================================================================================
# ======================================================================================
# Global defaults, overwritable in each job
# ======================================================================================
# ======================================================================================
# Image from dockerhub per default. Specify full path to use a different image.
......@@ -11,7 +16,33 @@ default:
# no entrypoint or that the entrypoint is prepared to start a shell command. ")
# See also:
entrypoint: [ "" ]
entrypoint: [""]
# ======================================================================================
# ======================================================================================
# Job templates (start with period).
# Not actually run by the runner, but can be used to extend other jobs, keeping the
# config dry.
# The main logic happens here, the actual jobs only combine these as needed.
# ======================================================================================
# ======================================================================================
.make: # Core job template
# Use make as the basis for all jobs. This allows to also easily run the steps
# offline/locally, as well as transition to other CI engines. All job names have to
# correspond to targets make knows about (see Makefile).
# Note we cannot have `default: script:`, so this approach works better.
script: make ${CI_JOB_NAME}
.pdf: # Job template for PDF-producing jobs
extends: .make
stage: build
# Make sure to clean any PDFs that might have crept in via the cache, forcing a
# remake at least once (otherwise, no remake might occur).
# It would be better to exclude the PDFs from the cache in the first place. This is
# not available yet, see:
- make clean-pdf
# LaTeX and pandoc stages both provide PDFs:
# is renamed to current tag/branch:
......@@ -20,36 +51,58 @@ default:
# Return all found *.pdf-files using wildcard.
# For example, a thesis and the accompanying presentation.
- "*.pdf"
# Run job 'only' if it fulfills certain criteria (BETTER: use `rules`):
# only:
# - tags
extends: .make
stage: test
# Would be ideal to read the image tag from `pyproject.toml`, i.e. the poetry project
# file. See also:
image: python:3.7.9
# Make sure to clean any PDFs that might have crept in via the cache, forcing a
# remake at least once (otherwise, no remake might occur).
# It would be better to exclude the PDFs from the cache in the first place. This is
# not available yet, see:
- make clean-pdf
- cd tests
# Allow caching by only downloading first:
- pip download --dest=${PIP_DOWNLOAD_DIR} poetry
- pip install --find-links=${PIP_DOWNLOAD_DIR} poetry
# Make available for caching by installing to current directory:
- poetry config true
- poetry install -vv
untracked: true
# Cache is invalidated if any of these files changes, but also shared if
# these two files are equal.
- poetry.lock
- pyproject.toml
# ======================================================================================
# ======================================================================================
# Actual job definitions.
# All job names have to correspond to make targets, see .make above.
# ======================================================================================
# ======================================================================================
extends: .make
stage: .pre
- make preflight
# Preflight checks are relevant, but we are interested in how exactly later jobs
# error out if preflight fails. Therefore, allow failure.
allow_failure: true
# Do not inherit artifacts, this stage has none.
default: [image]
stage: build
- make tex
extends: .pdf
untracked: true
untracked: true
stage: build
- make README.pdf
extends: .pdf
extends: .test
needs: [] # Allows job to start immediately
extends: .test
......@@ -7,35 +7,61 @@
# The following are "special targets", see:
# A phony target: not a file, just some routine.
.PHONY: all clean mostlyclean clean-aux clean-pdf tex preflight
.PHONY: all clean mostlyclean clean.aux clean.pdf tex preflight test test-self test-pdfs help
# =====================================================================================
# Set variables, executables and their flags
# Helper tool, adjusted from:
# Allows to annotate targets with regular comments and have a summary printed by calling
# `make help`.
# =====================================================================================
# Note escaping of comment char #
FS = ":.*?\#"
help: # List available targets on this project. First one shown is the default.
@grep --extended-regexp "\w+$(FS) .*" $(MAKEFILE_LIST) | \
awk --field-separator="$(FS)" '{printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
# =====================================================================================
# Set variables, executables and their flags.
# =====================================================================================
# Common flags go here:
# Configure latexmk tool using '.latexmkrc' in project root, not in here.
LATEXMK = latexmk
PANDOC = pandoc
# For pandoc, provide dynamic metadata for the date (see below). Git short SHA works
# both in CI and locally. All other settings are in the `defaults` file.
PANDOC_FLAGS = --defaults=pandoc/defaults.yaml
# Flags depending on CI/Local go here:
# GitLab CI defines variables that we can check for. This allows us to detect if we're
# in a CI scenario.
# See also:
ifdef CI
# In container, run commands directly:
LATEXMK = latexmk
PANDOC = pandoc
# pandoc is quiet by default
PANDOC_FLAGS += --verbose
# After the run, display the relevant rules (for debugging)
LATEXMK_FLAGS += --rules
# latexmk is verbose by default
DOCKER_RUN = docker run --rm --volume ${PWD}:/tex
DOCKER_IMAGE = alexpovel/latex
LATEXMK = $(DOCKER_RUN) --entrypoint="latexmk" $(DOCKER_IMAGE)
PANDOC = $(DOCKER_RUN) --entrypoint="pandoc" $(DOCKER_IMAGE)
# No supporting Docker image available yet:
GIT_SHORT_SHA = $(shell git rev-parse --short HEAD)
# latexmk is verbose by default:
LATEXMK_FLAGS += --quiet
......@@ -56,9 +82,9 @@ tex_pdfs := $(tex_sources:.tex=.pdf)
# The name `all` is just a convention.
# Change suffix of multiple different extensions (.tex, .md), to the same suffix (.pdf).
# See also:
all: preflight tex README.pdf
all: preflight tex README.pdf test # Performs preflight checks, then builds and tests all PDFs.
# A rule for only LaTeX files:
tex: $(tex_pdfs)
tex: $(tex_pdfs) # Builds all *.tex files into PDFs.
# =====================================================================================
# Rules for file building
......@@ -77,14 +103,14 @@ tex: $(tex_pdfs)
# Just sets up an implicit rule to specify how to get from prerequisite to target,
# called whenever `make` detects it needs to do so. No need to specify things manually.
%.pdf: %.tex
%.pdf: %.tex # Allows to build any PDF from a corresponding *.tex file.
$(info Running $(LATEXMK) to build $@...)
PANDOC_TEMPLATE = $(strip $(shell grep "^template:" pandoc/defaults.yaml | cut --delimiter=":" --field=2)).latex
PANDOC_TEMPLATE_DIR = /usr/share/pandoc/data/templates
%.pdf: $(PANDOC_TEMPLATE_DIR)/$(PANDOC_TEMPLATE) # Allows to build any PDF from a corresponding *.md file.
$(info Running $(PANDOC) to build $@...)
@$(PANDOC) $(PANDOC_FLAGS) --output=$@ $<
......@@ -104,7 +130,7 @@ $(PANDOC_TEMPLATE_DIR)/$(PANDOC_TEMPLATE):
# =====================================================================================
# Help users install programs required for compilation and help debug.
# =====================================================================================
preflight: # Performs checks to ensure prerequisites for compilation are met.
@echo "Checking presence of required libraries..."
@ldconfig --print-cache | grep --silent "librsvg" || \
(echo "librsvg missing: required by pandoc to convert files containing SVGs."; exit 69)
......@@ -112,21 +138,41 @@ preflight:
# Output looks like:
@$(LATEXMK) --commands
# =====================================================================================
# Testing.
# =====================================================================================
test: test-self test-pdfs # Runs all tests.
@echo "All tests passed."
TESTS_DIR = tests
# Delegate to subdirectory and run the Makefile found there.
# Important for compatibility with CI config.
test-self: # Runs tests on the tests themselves.
@$(MAKE) --directory=$(TESTS_DIR) test-self
test-pdfs: # Runs tests on found PDFs.
@$(MAKE) --directory=$(TESTS_DIR) test-pdfs
# =====================================================================================
# Cleanup jobs.
# =====================================================================================
clean-aux: # Cleans LaTeX's auxiliary files.
@echo "Removing auxiliary files of all found TeX files..."
clean-pdf: # Cleans all found PDFs.
@echo "Removing all PDF files:"
@ls *.pdf 2>/dev/null || echo "No files to remove."
@$(RM) *.pdf
# For target name, see:
mostlyclean: clean-aux clean-pdf
mostlyclean: clean.aux clean.pdf # Runs clean.{aux,pdf}, then cleans more.
@echo "Removing downloaded pandoc archive, if any..."
clean: mostlyclean
clean: mostlyclean # Runs all other clean jobs, then cleans absolutely everything.
@echo "Removing all files ignored by git (.gitignore)..."
@echo "For safety, this is done interactively:"
@git clean -xd --interactive
# No Docker image available for this yet. Refer to the CI config to configure this for
# local use.
PYTEST = poetry run pytest
ifdef CI
PYTEST_FLAGS = --verbose
test-self: # Runs tests on the tests themselves.
test-pdfs: # Runs tests on found PDFs.
encryption: null # why tho
pdf_version: "1.5" # Versions are strings, not floats (see semantic versioning)
n_min: 1
n_max: null # Null will ignore this boundary
# This is either a mapping of <name>.pdf to <allowed papersize>, or one value for
# *all* PDFs, for example: `papersize: "a4"`.
# Possible values:
cookbook: "a5"
README: "a4"
# File sizes in human-readable format, or null to ignore.
min_size: "100K" # Some rough minimum, maybe useful to detect corrupted files
max_size: "5M"
bookmarks: true # Check if those are present
# For age, d, h, m and s (or null to skip test) are available. Order matters.
# Could be useful to detect old files that crept, e.g. from a cache.
max_age: "10m"
required_strings: null
profanity_allowed: false
This diff is collapsed.
name = "tests"
version = "1.3.0"
description = "Checks for PDFs."
authors = ["Alex Povel <>"]
python = "3.7.9"
pymupdf = "1.18.2"
profanity-check = "1.0.3"
scikit-learn = "0.20.2"
pytest = "^6.1.2"
PyYAML = "^5.3.1"
python-dateutil = "^2.8.1"
PaperSize = "^1.0.1"
ipython = "^7.19.0"
black = "^20.8b1"
requires = ["poetry>=0.12"]
build-backend = "poetry.masonry.api"
"""Strictly required for `pytest` to find this Python package and import from it."""
"""File to configure pytest, e.g. to implement hooks.
See also:
List of hooks:
from tests.utils import _PROJECT_ROOT
def pytest_make_parametrize_id(config, val, argname):
    """Provide IDs aka names for test cases.

    pytest generates automatic IDs. Using this function, they can be altered to
    whatever more legible representation, see the pytest hookspec documentation.

    Implementing this function in a specific file using a specific name will hook it
    into pytest and use it for *all* ID generation automatically, so no need to specify
    `ids=<func>` all the time.
    """
    try:
        # Shorten filepath significantly, in relation to project root.
        val = val.relative_to(_PROJECT_ROOT)
    except AttributeError:
        # Not a path-like value (no `relative_to` attribute); stringify it as-is.
        # NOTE(review): the original `try:` line was lost in extraction; this
        # reconstruction preserves the visible except/return structure — confirm.
        pass
    return str(val)
"""Tests whether PDFs documents fulfill certain criteria.
This is a bit backwards: `pytest` ordinarily tests functions/units, with sample data
being generated as needed. This module tests *data*, not the logic working on them.
For this, a package like `datatest` is maybe more suitable. However, the framework
provided by `pytest` works just fine. It gets us pretty summaries and the like, much
better than crafting that ourselves.
One core downside is that there is way too much logic in the tests themselves, with
no way of testing the tests that test the data. If these are wrong (likelihood increases
with more complexity), we are in for a bad time.
Idea from:
import math
from collections import namedtuple
from datetime import datetime as dt
from itertools import compress
from pathlib import Path
from typing import List
from warnings import warn
import fitz # PyMuPDF
import pytest
import yaml # PyYAML
from dateutil.parser import parse as parse_date
from papersize import parse_papersize
from profanity_check import predict
from tests.utils import (
# Monkey-patch a new, prettier property to access a page's text, instead of calling the
# camelCase method. PEP-8 for the win.
# `page.text` now reads the same as `page.getText()` would.
fitz.Page.text = property(lambda self: self.getText())
# Same treatment for the table of contents: `doc.toc` instead of `doc.getToC()`.
fitz.Document.toc = property(lambda self: self.getToC())
# Simple width/height pair used when comparing page dimensions against paper sizes.
Rect = namedtuple("Rect", ["width", "height"])
def get_project_root_files(suffix: str) -> List[Path]:
    """Yield all files directly under the project root carrying the given suffix.

    Only the root level is searched (no recursion), matching the project's
    convention of keeping deliverable PDFs/TeX sources at the top level.
    """
    pattern = "*." + suffix
    for candidate in _PROJECT_ROOT.glob(pattern):
        yield candidate
def config():
    """Load and return the test configuration from `tests/config.yml`."""
    config_path = _PROJECT_ROOT / "tests" / "config.yml"
    with open(config_path) as handle:
        return yaml.safe_load(handle)
def pdf(request): # HAS to be named 'request'
    # Fixture yielding an opened PDF document for each parametrized file.
    # NOTE(review): the right-hand side of this assignment was lost in extraction;
    # presumably it opened the parametrized file, e.g. `fitz.open(request.param)` —
    # confirm against the original repository.
    doc =
    # Context manager doesn't work, it closes too soon.
    yield doc
def test_profanity(pdf, config):
    """Asserts that no page contains lines classified as profane."""
    if config["content"]["profanity_allowed"]:
        pytest.skip("Profanity is allowed, ignoring test.")
    for page in pdf:
        page_lines = page.text.splitlines()
        if not page_lines:
            # `predict` raises if handed an empty argument, so skip blank pages.
            continue
        predictions = predict(page_lines)
        # Compressing loses information on line number. Not too bad because that is
        # imprecise anyway, page number should be enough.
        # Cast to list because iterator evaluates to True even if empty.
        profanities = list(compress(page_lines, predictions))
        assert (
            not profanities
        ), f"Profanities {profanities} found on page {page.number + 1}!"
def test_page_numbers(pdf, config):
"""Tests that number of pages is within limits.
Zero-length PDFs don't exist, but perhaps some form of file corruption could produce
n = len(pdf)
page_config = config["pages"]
n_min = page_config["n_min"] or 1
n_max = page_config["n_max"] or math.inf
msg = (
f"Document page count out of bounds: {n} pages not within {n_min} and {n_max}."
assert n_min <= n <= n_max, msg
def test_bookmarks(pdf, config):
"""Checks if a table of contents (ToC) is present."""
bookmarks = config["file"]["bookmarks"]
not_ = "\b" if bookmarks else "not" # ASCII backspace, remove on space char
assert bool(pdf.toc) == bool(
), f"Bookmarks presence {not_} requested, but opposite found."
def test_required_strings(pdf, config):
"""Tests whether each required text is found in the document."""
content_config = config["content"]
for text in content_config["required_strings"] or []:
hits = sum(text in page.text for page in pdf)
assert hits, f"Text '{text}' not found in document."
def test_file_size(pdf, config):
"""Tests that filesize is within limits."""
file_config = config["file"]
size = Path(
min_size = parse_size(file_config["min_size"])