pandoc-reader/pandoc_reader.py

"""Reader that processes Pandoc Markdown and returns HTML5."""
import json
import math
import os
import shutil
import subprocess

import bs4
from ruamel.yaml import YAML, constructor

from pelican import signals
from pelican.readers import BaseReader
from pelican.utils import pelican_open

DEFAULT_READING_SPEED = 200  # Words per minute
DEFAULT_PANDOC_EXECUTABLE = "pandoc"
DIR_PATH = os.path.dirname(__file__)
ENCODED_LINKS_TO_RAW_LINKS_MAP = {
    "%7Bstatic%7D": "{static}",
    "%7Battach%7D": "{attach}",
    "%7Bfilename%7D": "{filename}",
}
FILE_EXTENSIONS = ["md", "mkd", "mkdn", "mdwn", "mdown", "markdown", "Rmd"]
PANDOC_READER_HTML_TEMPLATE = "pandoc-reader-default.html"
PANDOC_SUPPORTED_MAJOR_VERSION = 2
PANDOC_SUPPORTED_MINOR_VERSION = 11

TEMPLATES_PATH = os.path.abspath(os.path.join(DIR_PATH, "templates"))
UNSUPPORTED_ARGUMENTS = ("--standalone", "--self-contained")
VALID_BIB_EXTENSIONS = ["json", "yaml", "bibtex", "bib"]

# Markdown variants supported in defaults files
# Update as Pandoc adds or removes support for formats
VALID_INPUT_FORMATS = (
    "commonmark",
    "commonmark_x",
    "gfm",
    "markdown",
    "markdown_mmd",
    "markdown_phpextra",
    "markdown_strict",
)
VALID_OUTPUT_FORMATS = ("html", "html5")


class PandocReader(BaseReader):
    """Convert files written in Pandoc Markdown to HTML 5."""

    enabled = True
    file_extensions = FILE_EXTENSIONS

    def read(self, source_path):
        """Parse Pandoc Markdown and return HTML5 markup and metadata."""
        # Get the user-defined path to the Pandoc executable or fall back to default
        pandoc_executable = self.settings.get(
            "PANDOC_EXECUTABLE_PATH", DEFAULT_PANDOC_EXECUTABLE
        )

        # If user-defined path, expand and make it absolute in case the path is relative
        if pandoc_executable != DEFAULT_PANDOC_EXECUTABLE:
            pandoc_executable = os.path.abspath(os.path.expanduser(pandoc_executable))

        # Check if pandoc is installed and is executable
        if not shutil.which(pandoc_executable):
            raise Exception("Could not find Pandoc. Please install.")

        # Check if the version of pandoc installed is 2.11 or higher
        self._check_pandoc_version(pandoc_executable)

        # Open Markdown file and read content
        content = ""
        with pelican_open(source_path) as file_content:
            content = file_content

        # Retrieve HTML content and metadata
        output, metadata = self._create_html(source_path, content, pandoc_executable)

        return output, metadata

    def _create_html(self, source_path, content, pandoc_executable):
        """Create HTML5 content."""
        # Get settings set in pelicanconf.py
        defaults_files = self.settings.get("PANDOC_DEFAULTS_FILES", [])

        # Adding support for the old defaults file setting
        # which will be removed in future versions of this plugin
        if self.settings.get("PANDOC_DEFAULT_FILES", []):
            defaults_files = self.settings.get("PANDOC_DEFAULT_FILES")

        arguments = self.settings.get("PANDOC_ARGS", [])
        extensions = self.settings.get("PANDOC_EXTENSIONS", [])

        if isinstance(extensions, list):
            extensions = "".join(extensions)

        # Check if source content has a YAML metadata block
        self._check_yaml_metadata_block(content)

        # Check validity of arguments or defaults files
        table_of_contents, citations = self._validate_fields(
            defaults_files, arguments, extensions
        )

        # Construct preliminary pandoc command
        pandoc_cmd = self._construct_pandoc_command(
            pandoc_executable, defaults_files, arguments, extensions
        )

        # Find and add bibliography if citations are specified
        if citations:
            for bib_file in self._find_bibs(source_path):
                pandoc_cmd.append("--bibliography={0}".format(bib_file))

        # Create HTML content using pandoc-reader-default.html template
        output = self._run_pandoc(pandoc_cmd, content)

        # Extract table of contents, text and metadata from HTML output
        output, toc, pandoc_metadata = self._extract_contents(output, table_of_contents)

        # Replace all occurrences of %7Bstatic%7D to {static},
        # %7Battach%7D to {attach} and %7Bfilename%7D to {filename}
        # so that static links are resolvable by pelican
        for encoded_str, raw_str in ENCODED_LINKS_TO_RAW_LINKS_MAP.items():
            output = output.replace(encoded_str, raw_str)

        # Parse Pandoc metadata and add it to Pelican
        metadata = self._process_metadata(pandoc_metadata)

        if table_of_contents:
            # Create table of contents and add to metadata
            metadata["toc"] = self.process_metadata("toc", toc)

        return output, metadata

    def _validate_fields(self, defaults_files, arguments, extensions):
        """Validate fields and return citations and ToC request values."""
        # If defaults_files is empty then validate the argument and extensions
        if not defaults_files:
            # Validate the arguments to see that they are supported
            # by the plugin
            self._check_arguments(arguments)

            # Check if citations have been requested
            citations = self._check_if_citations(arguments, extensions)

            # Check if table of contents has been requested
            table_of_contents = self._check_if_toc(arguments)
        else:
            # Validate defaults files and get the citations
            # abd table of contents request value
            citations, table_of_contents = self._check_defaults(defaults_files)
        return table_of_contents, citations

    def _check_defaults(self, defaults_files):
        """Check if the given Pandoc defaults file has valid values."""
        citations = False
        table_of_contents = False

        # Get the data in all defaults files as a string
        defaults_data = ""
        for defaults_file in defaults_files:
            with open(defaults_file, "r") as file_handle:
                for line in file_handle.readlines():
                    defaults_data += line

        # Convert YAML data to a Python dictionary
        defaults = {}
        try:
            yaml = YAML()
            defaults = yaml.load(defaults_data)
        except constructor.DuplicateKeyError as duplicate_key_error:
            raise ValueError(
                "Duplicate keys defined in multiple defaults files."
            ) from duplicate_key_error

        self._check_if_unsupported_settings(defaults)
        reader = self._check_input_format(defaults)
        self._check_output_format(defaults)

        if not citations:
            citeproc_specified = False

            # Cases where citeproc is specified as citeproc: true
            if defaults.get("citeproc", ""):
                citeproc_specified = True

            # Cases where citeproc is specified in filters
            elif "citeproc" in defaults.get("filters", ""):
                citeproc_specified = True

            # The extension +citations is enabled by default in Pandoc 2.11
            # we are checking that the extension is not disabled using -citations
            if citeproc_specified and "-citations" not in reader:
                citations = True

        if not table_of_contents:
            if defaults.get("table-of-contents", ""):
                table_of_contents = True

        return citations, table_of_contents

    def _process_metadata(self, pandoc_metadata):
        """Process Pandoc metadata and add it to Pelican."""
        # Cycle through the metadata and process them
        metadata = {}
        for key, value in pandoc_metadata.items():
            key = key.lower()
            if value and isinstance(value, str):
                value = value.strip().strip('"')

            # Process the metadata
            metadata[key] = self.process_metadata(key, value)
        return metadata

    @staticmethod
    def _check_pandoc_version(pandoc_executable):
        """Check that the specified version of Pandoc is 2.11 or higher."""
        output = subprocess.run(
            [pandoc_executable, "--version"],
            capture_output=True,
            encoding="utf-8",
            check=True,
        )

        # Returns a string of the form pandoc <version>
        pandoc_version = output.stdout.split("\n")[0]

        # Get the major and minor version from the above version string
        major_version = pandoc_version.split()[1].split(".")[0]
        minor_version = pandoc_version.split()[1].split(".")[1]

        # Pandoc major version less than 2 are not supported
        if int(major_version) < PANDOC_SUPPORTED_MAJOR_VERSION:
            raise Exception("Pandoc version must be 2.11 or higher.")

        # Pandoc major version 2 minor version less than 11 are not supported
        if (
            int(major_version) == PANDOC_SUPPORTED_MAJOR_VERSION
            and int(minor_version) < PANDOC_SUPPORTED_MINOR_VERSION
        ):
            raise Exception("Pandoc version must be 2.11 or higher.")

    @staticmethod
    def _check_yaml_metadata_block(content):
        """Check if the source content has a YAML metadata block."""
        # Check that the given content is not empty
        if not content:
            raise Exception("Could not find metadata. File is empty.")

        # Split content into a list of lines
        content_lines = content.splitlines()

        # Check that the first line of the file starts with a YAML block
        if content_lines[0].rstrip() not in ["---"]:
            raise Exception("Could not find metadata header '---'.")

        # Find the end of the YAML block
        yaml_block_end = ""
        for line_num, line in enumerate(content_lines[1:]):
            if line.rstrip() in ["---", "..."]:
                yaml_block_end = line_num
                break

        # Check if the end of the YAML block was found
        if not yaml_block_end:
            raise Exception("Could not find end of metadata block.")

    @staticmethod
    def _construct_pandoc_command(
        pandoc_executable, defaults_files, arguments, extensions
    ):
        """Construct Pandoc command for content."""
        pandoc_cmd = [
            pandoc_executable,
            "--standalone",
            "--template={}".format(
                os.path.join(TEMPLATES_PATH, PANDOC_READER_HTML_TEMPLATE)
            ),
        ]
        if not defaults_files:
            pandoc_cmd.extend(["--from", "markdown" + extensions, "--to", "html5"])
            pandoc_cmd.extend(arguments)
        else:
            for defaults_file in defaults_files:
                pandoc_cmd.append("--defaults={0}".format(defaults_file))
        return pandoc_cmd

    @staticmethod
    def _run_pandoc(pandoc_cmd, content):
        """Execute the given pandoc command and return output."""
        output = subprocess.run(
            pandoc_cmd,
            input=content,
            capture_output=True,
            encoding="utf-8",
            check=True,
        )
        return output.stdout

    @staticmethod
    def _extract_contents(html_output, table_of_contents):
        """Extract body html, table of contents and metadata from output."""
        # Extract pandoc metadata from html output
        pandoc_json_metadata, _, html_output = html_output.partition("\n")

        # Convert JSON string to dict
        pandoc_metadata = json.loads(pandoc_json_metadata)

        # Parse HTML output
        soup = bs4.BeautifulSoup(html_output, "html.parser")

        # Extract the table of contents if one was requested
        toc = ""
        if table_of_contents:
            # Find the table of contents
            toc = soup.body.find("nav", id="TOC")

            if toc:
                # Convert it to a string
                toc = str(toc)

                # Replace id=TOC with class="toc"
                toc = toc.replace('id="TOC"', 'class="toc"')

                # Remove the table of contents from the HTML output
                soup.body.find("nav", id="TOC").decompose()

        # Remove body tag around html output
        soup.body.unwrap()

        # Strip leading and trailing spaces
        html_output = str(soup).strip()

        return html_output, toc, pandoc_metadata

    @staticmethod
    def _check_if_citations(arguments, extensions):
        """Check if citations are specified."""
        citations = False
        if arguments and extensions:
            # The +citations extension is enabled by default in Pandoc 2.11
            # therefore we do a check to see that it is not disabled in extensions
            if (
                "--citeproc" in arguments or "-C" in arguments
            ) and "-citations" not in extensions:
                citations = True
        return citations

    @staticmethod
    def _check_if_toc(arguments):
        """Check if a table of contents should be generated."""
        table_of_contents = False
        if arguments:
            if "--toc" in arguments or "--table-of-contents" in arguments:
                table_of_contents = True
        return table_of_contents

    @staticmethod
    def _find_bibs(source_path):
        """Find bibliographies recursively in the sourcepath given."""
        bib_files = []
        filename = os.path.splitext(os.path.basename(source_path))[0]
        directory_path = os.path.dirname(os.path.abspath(source_path))
        for root, _, files in os.walk(directory_path):
            for extension in VALID_BIB_EXTENSIONS:
                bib_name = ".".join([filename, extension])
                if bib_name in files:
                    bib_files.append(os.path.join(root, bib_name))
        return bib_files

    @staticmethod
    def _check_arguments(arguments):
        """Check to see that only supported arguments have been passed."""
        for arg in arguments:
            if arg in UNSUPPORTED_ARGUMENTS:
                raise ValueError("Argument {0} is not supported.".format(arg))

    @staticmethod
    def _check_if_unsupported_settings(defaults):
        """Check if unsupported settings are specified in the defaults."""
        for arg in UNSUPPORTED_ARGUMENTS:
            arg = arg[2:]
            if defaults.get(arg, ""):
                raise ValueError("The default {} should be set to false.".format(arg))

    @staticmethod
    def _check_input_format(defaults):
        """Check if the input format given is a Markdown variant."""
        reader = ""
        reader_input = defaults.get("reader", "")
        from_input = defaults.get("from", "")

        # Case where no input format is specified
        if not reader_input and not from_input:
            raise ValueError("No input format specified.")

        # Case where both reader and from are specified which is not supported
        if reader_input and from_input:
            raise ValueError(
                (
                    "Specifying both from and reader is not supported."
                    " Please specify just one."
                )
            )

        if reader_input or from_input:
            if reader_input:
                reader = reader_input
            elif from_input:
                reader = from_input

            reader_prefix = reader.replace("+", "-").split("-")[0]

            # Check to see if the reader_prefix matches a valid input format
            if reader_prefix not in VALID_INPUT_FORMATS:
                raise ValueError("Input type has to be a Markdown variant.")
        return reader

    @staticmethod
    def _check_output_format(defaults):
        """Check if the output format is HTML or HTML5."""
        writer_output = defaults.get("writer", "")
        to_output = defaults.get("to", "")

        # Case where both writer and to are specified which is not supported
        if writer_output and to_output:
            raise ValueError(
                (
                    "Specifying both to and writer is not supported."
                    " Please specify just one."
                )
            )

        # Case where neither writer nor to value is set to html
        if (
            writer_output not in VALID_OUTPUT_FORMATS
            and to_output not in VALID_OUTPUT_FORMATS
        ):
            output_formats = " or ".join(VALID_OUTPUT_FORMATS)
            raise ValueError(
                "Output format type must be either {}.".format(output_formats)
            )


def add_reader(readers):
    """Add the PandocReader as the reader for all Pandoc Markdown files."""
    for ext in PandocReader.file_extensions:
        readers.reader_classes[ext] = PandocReader


def register():
    """Register the PandocReader."""
    signals.readers_init.connect(add_reader)
Обновление 2023-02-20 13:54:46 +00:00			`"""Reader that processes Pandoc Markdown and returns HTML5."""`
			`import json`
			`import math`
			`import os`
			`import shutil`
Fixes and enhancements including: * Add PANDOC_EXTENSIONS configuration variable, allowing one to enable or disable Pandoc's markdown extensions individually. * Remove dependency on pypandoc. * Don't change the working directory. * More efficient metadata extraction. 2014-11-26 18:42:37 +00:00			`import subprocess`
Add support for parsing YAML metadata. 2015-05-16 18:13:47 +00:00
Обновление 2023-02-20 13:54:46 +00:00			`import bs4`
			`from ruamel.yaml import YAML, constructor`

added actual code 2014-03-26 10:35:27 +00:00			`from pelican import signals`
			`from pelican.readers import BaseReader`
use pelican_open in order to read the file 2014-03-30 14:02:53 +00:00			`from pelican.utils import pelican_open`
prettify code 2014-08-27 07:41:47 +00:00
Обновление 2023-02-20 13:54:46 +00:00			`DEFAULT_READING_SPEED = 200 # Words per minute`
			`DEFAULT_PANDOC_EXECUTABLE = "pandoc"`
			`DIR_PATH = os.path.dirname(__file__)`
			`ENCODED_LINKS_TO_RAW_LINKS_MAP = {`
			`"%7Bstatic%7D": "{static}",`
			`"%7Battach%7D": "{attach}",`
			`"%7Bfilename%7D": "{filename}",`
			`}`
			`FILE_EXTENSIONS = ["md", "mkd", "mkdn", "mdwn", "mdown", "markdown", "Rmd"]`
			`PANDOC_READER_HTML_TEMPLATE = "pandoc-reader-default.html"`
			`PANDOC_SUPPORTED_MAJOR_VERSION = 2`
			`PANDOC_SUPPORTED_MINOR_VERSION = 11`

			`TEMPLATES_PATH = os.path.abspath(os.path.join(DIR_PATH, "templates"))`
			`UNSUPPORTED_ARGUMENTS = ("--standalone", "--self-contained")`
			`VALID_BIB_EXTENSIONS = ["json", "yaml", "bibtex", "bib"]`

			`# Markdown variants supported in defaults files`
			`# Update as Pandoc adds or removes support for formats`
			`VALID_INPUT_FORMATS = (`
			`"commonmark",`
			`"commonmark_x",`
			`"gfm",`
			`"markdown",`
			`"markdown_mmd",`
			`"markdown_phpextra",`
			`"markdown_strict",`
			`)`
			`VALID_OUTPUT_FORMATS = ("html", "html5")`
Add support for parsing YAML metadata. 2015-05-16 18:13:47 +00:00

prettify code 2014-08-27 07:41:47 +00:00			`class PandocReader(BaseReader):`
Обновление 2023-02-20 13:54:46 +00:00			`"""Convert files written in Pandoc Markdown to HTML 5."""`

added actual code 2014-03-26 10:35:27 +00:00			`enabled = True`
Обновление 2023-02-20 13:54:46 +00:00			`file_extensions = FILE_EXTENSIONS`

			`def read(self, source_path):`
			`"""Parse Pandoc Markdown and return HTML5 markup and metadata."""`
			`# Get the user-defined path to the Pandoc executable or fall back to default`
			`pandoc_executable = self.settings.get(`
			`"PANDOC_EXECUTABLE_PATH", DEFAULT_PANDOC_EXECUTABLE`
			`)`

			`# If user-defined path, expand and make it absolute in case the path is relative`
			`if pandoc_executable != DEFAULT_PANDOC_EXECUTABLE:`
			`pandoc_executable = os.path.abspath(os.path.expanduser(pandoc_executable))`

			`# Check if pandoc is installed and is executable`
			`if not shutil.which(pandoc_executable):`
			`raise Exception("Could not find Pandoc. Please install.")`

			`# Check if the version of pandoc installed is 2.11 or higher`
			`self._check_pandoc_version(pandoc_executable)`

			`# Open Markdown file and read content`
			`content = ""`
			`with pelican_open(source_path) as file_content:`
			`content = file_content`

			`# Retrieve HTML content and metadata`
			`output, metadata = self._create_html(source_path, content, pandoc_executable)`

			`return output, metadata`

			`def _create_html(self, source_path, content, pandoc_executable):`
			`"""Create HTML5 content."""`
			`# Get settings set in pelicanconf.py`
			`defaults_files = self.settings.get("PANDOC_DEFAULTS_FILES", [])`

			`# Adding support for the old defaults file setting`
			`# which will be removed in future versions of this plugin`
			`if self.settings.get("PANDOC_DEFAULT_FILES", []):`
			`defaults_files = self.settings.get("PANDOC_DEFAULT_FILES")`

			`arguments = self.settings.get("PANDOC_ARGS", [])`
			`extensions = self.settings.get("PANDOC_EXTENSIONS", [])`

			`if isinstance(extensions, list):`
			`extensions = "".join(extensions)`

			`# Check if source content has a YAML metadata block`
			`self._check_yaml_metadata_block(content)`

			`# Check validity of arguments or defaults files`
			`table_of_contents, citations = self._validate_fields(`
			`defaults_files, arguments, extensions`
			`)`

			`# Construct preliminary pandoc command`
			`pandoc_cmd = self._construct_pandoc_command(`
			`pandoc_executable, defaults_files, arguments, extensions`
			`)`

			`# Find and add bibliography if citations are specified`
			`if citations:`
			`for bib_file in self._find_bibs(source_path):`
			`pandoc_cmd.append("--bibliography={0}".format(bib_file))`

			`# Create HTML content using pandoc-reader-default.html template`
			`output = self._run_pandoc(pandoc_cmd, content)`

			`# Extract table of contents, text and metadata from HTML output`
			`output, toc, pandoc_metadata = self._extract_contents(output, table_of_contents)`

			`# Replace all occurrences of %7Bstatic%7D to {static},`
			`# %7Battach%7D to {attach} and %7Bfilename%7D to {filename}`
			`# so that static links are resolvable by pelican`
			`for encoded_str, raw_str in ENCODED_LINKS_TO_RAW_LINKS_MAP.items():`
			`output = output.replace(encoded_str, raw_str)`

			`# Parse Pandoc metadata and add it to Pelican`
			`metadata = self._process_metadata(pandoc_metadata)`

			`if table_of_contents:`
			`# Create table of contents and add to metadata`
			`metadata["toc"] = self.process_metadata("toc", toc)`

			`return output, metadata`

			`def _validate_fields(self, defaults_files, arguments, extensions):`
			`"""Validate fields and return citations and ToC request values."""`
			`# If defaults_files is empty then validate the argument and extensions`
			`if not defaults_files:`
			`# Validate the arguments to see that they are supported`
			`# by the plugin`
			`self._check_arguments(arguments)`

			`# Check if citations have been requested`
			`citations = self._check_if_citations(arguments, extensions)`

			`# Check if table of contents has been requested`
			`table_of_contents = self._check_if_toc(arguments)`
			`else:`
			`# Validate defaults files and get the citations`
			`# abd table of contents request value`
			`citations, table_of_contents = self._check_defaults(defaults_files)`
			`return table_of_contents, citations`

			`def _check_defaults(self, defaults_files):`
			`"""Check if the given Pandoc defaults file has valid values."""`
			`citations = False`
			`table_of_contents = False`

			`# Get the data in all defaults files as a string`
			`defaults_data = ""`
			`for defaults_file in defaults_files:`
			`with open(defaults_file, "r") as file_handle:`
			`for line in file_handle.readlines():`
			`defaults_data += line`

			`# Convert YAML data to a Python dictionary`
			`defaults = {}`
			`try:`
			`yaml = YAML()`
			`defaults = yaml.load(defaults_data)`
			`except constructor.DuplicateKeyError as duplicate_key_error:`
			`raise ValueError(`
			`"Duplicate keys defined in multiple defaults files."`
			`) from duplicate_key_error`

			`self._check_if_unsupported_settings(defaults)`
			`reader = self._check_input_format(defaults)`
			`self._check_output_format(defaults)`

			`if not citations:`
			`citeproc_specified = False`
added actual code 2014-03-26 10:35:27 +00:00
Обновление 2023-02-20 13:54:46 +00:00			`# Cases where citeproc is specified as citeproc: true`
			`if defaults.get("citeproc", ""):`
			`citeproc_specified = True`

			`# Cases where citeproc is specified in filters`
			`elif "citeproc" in defaults.get("filters", ""):`
			`citeproc_specified = True`

			`# The extension +citations is enabled by default in Pandoc 2.11`
			`# we are checking that the extension is not disabled using -citations`
			`if citeproc_specified and "-citations" not in reader:`
			`citations = True`

			`if not table_of_contents:`
			`if defaults.get("table-of-contents", ""):`
			`table_of_contents = True`

			`return citations, table_of_contents`

			`def _process_metadata(self, pandoc_metadata):`
			`"""Process Pandoc metadata and add it to Pelican."""`
			`# Cycle through the metadata and process them`
Add support for parsing YAML metadata. 2015-05-16 18:13:47 +00:00			`metadata = {}`
Обновление 2023-02-20 13:54:46 +00:00			`for key, value in pandoc_metadata.items():`
			`key = key.lower()`
			`if value and isinstance(value, str):`
			`value = value.strip().strip('"')`

			`# Process the metadata`
			`metadata[key] = self.process_metadata(key, value)`
			`return metadata`

			`@staticmethod`
			`def _check_pandoc_version(pandoc_executable):`
			`"""Check that the specified version of Pandoc is 2.11 or higher."""`
			`output = subprocess.run(`
			`[pandoc_executable, "--version"],`
			`capture_output=True,`
			`encoding="utf-8",`
			`check=True,`
			`)`
Add support for parsing YAML metadata. 2015-05-16 18:13:47 +00:00
Обновление 2023-02-20 13:54:46 +00:00			`# Returns a string of the form pandoc <version>`
			`pandoc_version = output.stdout.split("\n")[0]`
Add support for parsing YAML metadata. 2015-05-16 18:13:47 +00:00
Обновление 2023-02-20 13:54:46 +00:00			`# Get the major and minor version from the above version string`
			`major_version = pandoc_version.split()[1].split(".")[0]`
			`minor_version = pandoc_version.split()[1].split(".")[1]`
Add support for parsing YAML metadata. 2015-05-16 18:13:47 +00:00
Обновление 2023-02-20 13:54:46 +00:00			`# Pandoc major version less than 2 are not supported`
			`if int(major_version) < PANDOC_SUPPORTED_MAJOR_VERSION:`
			`raise Exception("Pandoc version must be 2.11 or higher.")`
Add support for parsing YAML metadata. 2015-05-16 18:13:47 +00:00
Обновление 2023-02-20 13:54:46 +00:00			`# Pandoc major version 2 minor version less than 11 are not supported`
			`if (`
			`int(major_version) == PANDOC_SUPPORTED_MAJOR_VERSION`
			`and int(minor_version) < PANDOC_SUPPORTED_MINOR_VERSION`
			`):`
			`raise Exception("Pandoc version must be 2.11 or higher.")`
Add support for parsing YAML metadata. 2015-05-16 18:13:47 +00:00
Обновление 2023-02-20 13:54:46 +00:00			`@staticmethod`
			`def _check_yaml_metadata_block(content):`
			`"""Check if the source content has a YAML metadata block."""`
			`# Check that the given content is not empty`
			`if not content:`
			`raise Exception("Could not find metadata. File is empty.")`
Allow empty summary 2019-04-25 11:52:57 +00:00
Обновление 2023-02-20 13:54:46 +00:00			`# Split content into a list of lines`
			`content_lines = content.splitlines()`
Usable, albeit slightly hackish, solution for YAML metadata. 2016-08-05 02:52:50 +00:00
Обновление 2023-02-20 13:54:46 +00:00			`# Check that the first line of the file starts with a YAML block`
			`if content_lines[0].rstrip() not in ["---"]:`
			`raise Exception("Could not find metadata header '---'.")`

			`# Find the end of the YAML block`
			`yaml_block_end = ""`
			`for line_num, line in enumerate(content_lines[1:]):`
			`if line.rstrip() in ["---", "..."]:`
			`yaml_block_end = line_num`
			`break`

			`# Check if the end of the YAML block was found`
			`if not yaml_block_end:`
			`raise Exception("Could not find end of metadata block.")`

			`@staticmethod`
			`def _construct_pandoc_command(`
			`pandoc_executable, defaults_files, arguments, extensions`
			`):`
			`"""Construct Pandoc command for content."""`
			`pandoc_cmd = [`
			`pandoc_executable,`
			`"--standalone",`
			`"--template={}".format(`
			`os.path.join(TEMPLATES_PATH, PANDOC_READER_HTML_TEMPLATE)`
			`),`
			`]`
			`if not defaults_files:`
			`pandoc_cmd.extend(["--from", "markdown" + extensions, "--to", "html5"])`
			`pandoc_cmd.extend(arguments)`
Add support for parsing YAML metadata. 2015-05-16 18:13:47 +00:00			`else:`
Обновление 2023-02-20 13:54:46 +00:00			`for defaults_file in defaults_files:`
			`pandoc_cmd.append("--defaults={0}".format(defaults_file))`
			`return pandoc_cmd`

			`@staticmethod`
			`def _run_pandoc(pandoc_cmd, content):`
			`"""Execute the given pandoc command and return output."""`
			`output = subprocess.run(`
attempt to enable bibligraphy but founder in relative file paths 2019-02-23 05:14:24 +00:00			`pandoc_cmd,`
Обновление 2023-02-20 13:54:46 +00:00			`input=content,`
			`capture_output=True,`
			`encoding="utf-8",`
			`check=True,`
			`)`
			`return output.stdout`
use brandonwillard's less crazy link preservation' 2018-07-14 02:32:30 +00:00
Обновление 2023-02-20 13:54:46 +00:00			`@staticmethod`
			`def _extract_contents(html_output, table_of_contents):`
			`"""Extract body html, table of contents and metadata from output."""`
			`# Extract pandoc metadata from html output`
			`pandoc_json_metadata, _, html_output = html_output.partition("\n")`

			`# Convert JSON string to dict`
			`pandoc_metadata = json.loads(pandoc_json_metadata)`

			`# Parse HTML output`
			`soup = bs4.BeautifulSoup(html_output, "html.parser")`

			`# Extract the table of contents if one was requested`
			`toc = ""`
			`if table_of_contents:`
			`# Find the table of contents`
			`toc = soup.body.find("nav", id="TOC")`

			`if toc:`
			`# Convert it to a string`
			`toc = str(toc)`

			`# Replace id=TOC with class="toc"`
			`toc = toc.replace('id="TOC"', 'class="toc"')`

			`# Remove the table of contents from the HTML output`
			`soup.body.find("nav", id="TOC").decompose()`

			`# Remove body tag around html output`
			`soup.body.unwrap()`

			`# Strip leading and trailing spaces`
			`html_output = str(soup).strip()`

			`return html_output, toc, pandoc_metadata`

			`@staticmethod`
			`def _check_if_citations(arguments, extensions):`
			`"""Check if citations are specified."""`
			`citations = False`
			`if arguments and extensions:`
			`# The +citations extension is enabled by default in Pandoc 2.11`
			`# therefore we do a check to see that it is not disabled in extensions`
			`if (`
			`"--citeproc" in arguments or "-C" in arguments`
			`) and "-citations" not in extensions:`
			`citations = True`
			`return citations`

			`@staticmethod`
			`def _check_if_toc(arguments):`
			`"""Check if a table of contents should be generated."""`
			`table_of_contents = False`
			`if arguments:`
			`if "--toc" in arguments or "--table-of-contents" in arguments:`
			`table_of_contents = True`
			`return table_of_contents`

			`@staticmethod`
			`def _find_bibs(source_path):`
			`"""Find bibliographies recursively in the sourcepath given."""`
			`bib_files = []`
			`filename = os.path.splitext(os.path.basename(source_path))[0]`
			`directory_path = os.path.dirname(os.path.abspath(source_path))`
			`for root, _, files in os.walk(directory_path):`
			`for extension in VALID_BIB_EXTENSIONS:`
			`bib_name = ".".join([filename, extension])`
			`if bib_name in files:`
			`bib_files.append(os.path.join(root, bib_name))`
			`return bib_files`

			`@staticmethod`
			`def _check_arguments(arguments):`
			`"""Check to see that only supported arguments have been passed."""`
			`for arg in arguments:`
			`if arg in UNSUPPORTED_ARGUMENTS:`
			`raise ValueError("Argument {0} is not supported.".format(arg))`

			`@staticmethod`
			`def _check_if_unsupported_settings(defaults):`
			`"""Check if unsupported settings are specified in the defaults."""`
			`for arg in UNSUPPORTED_ARGUMENTS:`
			`arg = arg[2:]`
			`if defaults.get(arg, ""):`
			`raise ValueError("The default {} should be set to false.".format(arg))`

			`@staticmethod`
			`def _check_input_format(defaults):`
			`"""Check if the input format given is a Markdown variant."""`
			`reader = ""`
			`reader_input = defaults.get("reader", "")`
			`from_input = defaults.get("from", "")`

			`# Case where no input format is specified`
			`if not reader_input and not from_input:`
			`raise ValueError("No input format specified.")`

			`# Case where both reader and from are specified which is not supported`
			`if reader_input and from_input:`
			`raise ValueError(`
			`(`
			`"Specifying both from and reader is not supported."`
			`" Please specify just one."`
			`)`
			`)`

			`if reader_input or from_input:`
			`if reader_input:`
			`reader = reader_input`
			`elif from_input:`
			`reader = from_input`

			`reader_prefix = reader.replace("+", "-").split("-")[0]`

			`# Check to see if the reader_prefix matches a valid input format`
			`if reader_prefix not in VALID_INPUT_FORMATS:`
			`raise ValueError("Input type has to be a Markdown variant.")`
			`return reader`

			`@staticmethod`
			`def _check_output_format(defaults):`
			`"""Check if the output format is HTML or HTML5."""`
			`writer_output = defaults.get("writer", "")`
			`to_output = defaults.get("to", "")`

			`# Case where both writer and to are specified which is not supported`
			`if writer_output and to_output:`
			`raise ValueError(`
			`(`
			`"Specifying both to and writer is not supported."`
			`" Please specify just one."`
			`)`
			`)`

			`# Case where neither writer nor to value is set to html`
			`if (`
			`writer_output not in VALID_OUTPUT_FORMATS`
			`and to_output not in VALID_OUTPUT_FORMATS`
			`):`
			`output_formats = " or ".join(VALID_OUTPUT_FORMATS)`
			`raise ValueError(`
			`"Output format type must be either {}.".format(output_formats)`
			`)`
prettify code 2014-08-27 07:41:47 +00:00
Add support for parsing YAML metadata. 2015-05-16 18:13:47 +00:00
added actual code 2014-03-26 10:35:27 +00:00			`def add_reader(readers):`
Обновление 2023-02-20 13:54:46 +00:00			`"""Add the PandocReader as the reader for all Pandoc Markdown files."""`
pandoc reader for all markdown extensions This makes the PandocReader the default markdown reader for all markdown extensions not just "md" 2014-11-25 16:23:07 +00:00			`for ext in PandocReader.file_extensions:`
			`readers.reader_classes[ext] = PandocReader`
prettify code 2014-08-27 07:41:47 +00:00
Add support for parsing YAML metadata. 2015-05-16 18:13:47 +00:00
added actual code 2014-03-26 10:35:27 +00:00			`def register():`
Обновление 2023-02-20 13:54:46 +00:00			`"""Register the PandocReader."""`
added actual code 2014-03-26 10:35:27 +00:00			`signals.readers_init.connect(add_reader)`