From ad070eb927df66162ff10ac2256fc5e185d086cf Mon Sep 17 00:00:00 2001 From: Andrey Astafyev Date: Mon, 20 Feb 2023 16:54:46 +0300 Subject: [PATCH] =?UTF-8?q?=D0=9E=D0=B1=D0=BD=D0=BE=D0=B2=D0=BB=D0=B5?= =?UTF-8?q?=D0=BD=D0=B8=D0=B5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 286 ++++++++++++++++++++----- __init__.py | 3 +- pandoc_reader.py | 548 ++++++++++++++++++++++++++++++++++++++--------- 3 files changed, 691 insertions(+), 146 deletions(-) diff --git a/README.md b/README.md index 057b456..3175968 100644 --- a/README.md +++ b/README.md @@ -1,76 +1,266 @@ -pandoc_reader -============= +Pandoc Reader: A Plugin for Pelican +=================================== -A pandoc [markdown] reader plugin for [pelican] +[![Build Status](https://img.shields.io/github/workflow/status/pelican-plugins/pandoc-reader/build)](https://github.com/pelican-plugins/pandoc-reader/actions) +[![PyPI Version](https://img.shields.io/pypi/v/pelican-pandoc-reader)](https://pypi.org/project/pelican-pandoc-reader/) +![License](https://img.shields.io/pypi/l/pelican-pandoc-reader?color=blue) +Pandoc Reader is a [Pelican][] plugin that converts documents written in [Pandoc’s variant of Markdown][] into HTML. Requirements ------------ - - [pandoc] in $PATH - - [PyYAML] installed if you want to parse [YAML metadata] +This plugin requires: + +* Python 3.7 or higher; and +* Pandoc 2.11 or higher [[Pandoc installation instructions](https://pandoc.org/installing.html)]. + +By default, the plugin looks for a `pandoc` executable on your `PATH`. If you wish, [you can specify an alternative location for your `pandoc` executable](#customizing-path-for-pandoc-executable). Installation ------------ -Instructions for installation of pelican plugins can be obtained from the [pelican plugin manual](https://github.com/getpelican/pelican-plugins/blob/master/Readme.rst). 
+This plugin can be installed via: +```bash +python -m pip install pelican-pandoc-reader +``` Configuration ------------- -Additional command line parameters can be passed to pandoc via the PANDOC_ARGS parameter. +This plugin converts [Pandoc’s variant of Markdown][] into HTML. Conversion from other Markdown variants is supported but requires the use of a [Pandoc defaults file][Pandoc defaults files]. - PANDOC_ARGS = [ - '--mathjax', - '--smart', - '--toc', - '--toc-depth=2', - '--number-sections', - ] +Converting to output formats other than HTML is not supported. -Pandoc's markdown extensions can be enabled or disabled via the -PANDOC_EXTENSIONS parameter. +### Specifying File Metadata - PANDOC_EXTENSIONS = [ - '+hard_line_breaks', - '-citations' - ] +The plugin expects all Markdown files to start with a YAML-formatted content header, as shown below. +```yaml +--- +title: "" +author: "" +date: "" +--- +``` -YAML Metadata -------------- +… or … -No configuration is required to use YAML metadata. Simply include it at the top -of your post, started by `---` and terminated by `---` or `...`. If PyYAML is -not installed, the data will be parsed by the normal metadata parser instead. -For example: +```yaml +--- +title: "" +author: "" +date: "" +... +``` - --- - title: Using YAML with Pandoc! - author: Your Name - date: 2015-05-15 14:07 - description: > - You can include long, multiline descriptions which - can wrap across multiple lines (and will be joined - by YAML). - complex: - - or complex data structures - - like lists - ... +> ⚠️ **Note:** The YAML-formatted header shown above is syntax specific to Pandoc for specifying content metadata. This is different from Pelican’s front-matter format. If you ever decide to stop using this plugin and switch to Pelican’s default Markdown handling, you may need to switch your front-matter metadata to [Python-Markdown’s Meta-Data format](https://python-markdown.github.io/extensions/meta_data/). 
+ +If you have files that use Pelican's front matter format, there is a script written by [Joseph Reagle](https://github.com/reagle) available that [converts Pelican's front matter to Pandoc's YAML header format](https://gist.github.com/reagle/5bc44ba9e2f1b961d1aaca9179fb403b). + +For more information on Pandoc's YAML metadata block or Pelican's default metadata format please visit the links below: + +* [Pandoc’s YAML metadata blocks](https://pandoc.org/MANUAL.html#metadata-blocks) +* [Pelican’s default metadata format](https://docs.getpelican.com/en/stable/content.html#file-metadata) + +### Specifying Pandoc Options + +The plugin supports two **mutually exclusive** methods for passing options to Pandoc. + +#### Method One: Via Pelican Settings + +The first method involves configuring two settings in your Pelican settings file (e.g., `pelicanconf.py`): + +* `PANDOC_ARGS` +* `PANDOC_EXTENSIONS` + +In the `PANDOC_ARGS` setting, you may specify any arguments supported by Pandoc, as shown below: + +```python +PANDOC_ARGS = [ + "--mathjax", + "--citeproc", +] +``` + +In the `PANDOC_EXTENSIONS` setting, you may enable/disable any number of the supported [Pandoc extensions](https://pandoc.org/MANUAL.html#extensions): + +```python +PANDOC_EXTENSIONS = [ + "+footnotes", # Enabled extension + "-pipe_tables", # Disabled extension +] +``` + +#### Method Two: Using Pandoc Defaults Files + +The second method involves specifying the path(s) to one or more [Pandoc defaults files][], with all your preferences written in YAML format. + +These paths should be set in your Pelican settings file by using the setting `PANDOC_DEFAULTS_FILES`. The paths may be absolute or relative, but relative paths are recommended as they are more portable. 
+ +```python +PANDOC_DEFAULTS_FILES = [ + "", + "", +] +``` + +Here is a minimal example of content that should be available in a Pandoc defaults file: + +```yaml +reader: markdown +writer: html5 +``` + +Using defaults files has the added benefit of allowing you to use other Markdown variants supported by Pandoc, such as [CommonMark](https://commonmark.org/) and [GitHub-Flavored Markdown](https://docs.github.com/en/free-pro-team@latest/github/writing-on-github). + +Please see [Pandoc defaults files][] for a more complete example. + +> ⚠️ **Note:** Neither method supports the `--standalone` or `--self-contained` arguments, which will yield an error if invoked. + +### Generating a Table of Contents + +If you want to create a table of contents (ToC) for posts or pages, you may do so by specifying the `--toc` or `--table-of-contents` argument in the `PANDOC_ARGS` setting, as shown below: + +```python +PANDOC_ARGS = [ + "--toc", +] +``` + +… or … + +```python +PANDOC_ARGS = [ + "--table-of-contents", +] +``` + +To add a ToC via a Pandoc defaults file, use the syntax below: + +```yaml +table-of-contents: true +``` + +The table of contents will be available for use in templates using the `{{ article.toc }}` or `{{ page.toc }}` Jinja template variables. + +### Enabling Citations + +You may enable citations by specifying the `-C` or `--citeproc` option. + +Set the `PANDOC_ARGS` and `PANDOC_EXTENSIONS` in your Pelican settings file as shown below: + +```python +PANDOC_ARGS = [ + "--citeproc", +] +``` + +… or … + +```python +PANDOC_ARGS = [ + "-C", +] +``` + +If you are using a Pandoc defaults file, you need the following as a bare minimum to enable citations: + +```yaml +reader: markdown +writer: html5 + +citeproc: true +``` + +Without these settings, citations will not be processed by the plugin. + +It is not necessary to specify the `+citations` extension since it is enabled by default. 
However, if you were to disable citations by specifying `-citations` in `PANDOC_EXTENSIONS` or by setting `reader: markdown-citations` in your defaults file, citations will **not** work. + +You may write your bibliography in any format supported by Pandoc with the appropriate extensions specified. However, you **must** name the bibliography file the same as your post. + +For example, a post with the file name `my-post.md` should have a bibliography file called `my-post.bib`, `my-post.json`, `my-post.yaml` or `my-post.bibtex` in the same directory as your post, or in a subdirectory of the directory that your post resides in. Failure to do so will prevent the references from being picked up. + +#### Known Issues with Citations + +If enabling citations with a specific style, you need to specify a CSL (Citation Style Language) file, available from the [Zotero Style Repository](https://www.zotero.org/styles). For example, if you are using the `ieee-with-url` style file, it may be specified in your Pelican settings file, as shown below: + +```python +PANDOC_ARGS = [ + "--csl=https://www.zotero.org/styles/ieee-with-url", +] +``` + +Or in a Pandoc defaults file: + +```yaml +csl: "https://www.zotero.org/styles/ieee-with-url" +``` + +Specifying a *remote* (that is, not local) CSL file as shown above dramatically increases the time taken to process Markdown content. To improve processing speed, it is _highly_ recommended that you use a local copy of the CSL file downloaded from Zotero. 
+ +You may then reference it in your Pelican settings file as shown below: + +```python +PANDOC_ARGS = [ + "--csl=path/to/file/ieee-with-url.csl", +] +``` + +Or in a Pandoc defaults file: + +```yaml +csl: "path/to/file/ieee-with-url.csl" +``` + +### Calculating and Displaying Reading Time + +This plugin may be used to calculate the estimated reading time of articles and pages by setting `CALCULATE_READING_TIME` to `True` in your Pelican settings file: + +```python +CALCULATE_READING_TIME = True +``` + +You may display the estimated reading time using the `{{ article.reading_time }}` or `{{ page.reading_time }}` template variables. The unit of time will be displayed as “minute” for reading times less than or equal to one minute, or “minutes” for those greater than one minute. + +The reading time is calculated by dividing the number of words by the reading speed, which is the average number of words read in a minute. + +The default value for reading speed is set to 200 words per minute, but may be customized by setting `READING_SPEED` to the desired words per minute value in your Pelican settings file: + +```python +READING_SPEED = <words-per-minute> +``` + +The number of words in a document is calculated using the [Markdown Word Count](https://github.com/gandreadis/markdown-word-count) package. + +### Customizing Path for `pandoc` Executable + +If your `pandoc` executable does not reside on your `PATH`, set the `PANDOC_EXECUTABLE_PATH` in your Pelican settings file to the absolute path of where your `pandoc` resides as shown below: + +```python +PANDOC_EXECUTABLE_PATH = "/path/to/my/pandoc" +``` + +This setting is useful in cases where the `pandoc` executable from your hosting provider is not recent enough, and you may need to install a version of Pandoc—compatible with this plugin—in a non-standard location. Contributing ------------ -1. Fork it -2. Create your feature branch (`git checkout -b my-new-feature`) -3. Commit your changes (`git commit -am 'Add some feature'`) -4. 
Push to the branch (`git push origin my-new-feature`) -5. Create new Pull Request +Contributions are welcome and much appreciated. Every little bit helps. You can contribute by improving the documentation, adding missing features, and fixing bugs. You can also help out by reviewing and commenting on [existing issues][]. +To start contributing to this plugin, review the [Contributing to Pelican][] documentation, beginning with the **Contributing Code** section. -[markdown]: http://daringfireball.net/projects/markdown/ -[pandoc]: http://johnmacfarlane.net/pandoc/ -[pelican]: http://getpelican.com -[PyYAML]: http://pyyaml.org/ -[YAML metadata]: http://pandoc.org/README.html#extension-yaml_metadata_block \ No newline at end of file +Special thanks to [Justin Mayer](https://justinmayer.com), [Erwin Janssen](https://github.com/ErwinJanssen), [Joseph Reagle](https://github.com/reagle) and [Deniz Turgut](https://github.com/avaris) for their improvements and feedback on this plugin. + +[existing issues]: https://github.com/pelican-plugins/pandoc-reader/issues +[Contributing to Pelican]: https://docs.getpelican.com/en/latest/contribute.html + +License +------- + +This project is licensed under the AGPL-3.0 license. 
+ +[Pelican]: https://getpelican.com +[Pandoc’s variant of Markdown]: https://pandoc.org/MANUAL.html#pandocs-markdown +[Pandoc defaults files]: https://pandoc.org/MANUAL.html#default-files diff --git a/__init__.py b/__init__.py index 31d0856..124ec0b 100644 --- a/__init__.py +++ b/__init__.py @@ -1 +1,2 @@ -from .pandoc_reader import * +"""Importing pandoc_reader package.""" +from .pandoc_reader import * # NOQA diff --git a/pandoc_reader.py b/pandoc_reader.py index acd072f..877c48f 100644 --- a/pandoc_reader.py +++ b/pandoc_reader.py @@ -1,124 +1,478 @@ -import logging +"""Reader that processes Pandoc Markdown and returns HTML5.""" +import json +import math +import os +import shutil import subprocess +import bs4 +from mwc.counter import count_words_in_markdown +from ruamel.yaml import YAML, constructor + from pelican import signals from pelican.readers import BaseReader from pelican.utils import pelican_open -import os -try: - import yaml -except ImportError: - yaml = None - logging.warning("YAML is not installed; the YAML reader will not work.") +DEFAULT_READING_SPEED = 200 # Words per minute +DEFAULT_PANDOC_EXECUTABLE = "pandoc" +DIR_PATH = os.path.dirname(__file__) +ENCODED_LINKS_TO_RAW_LINKS_MAP = { + "%7Bstatic%7D": "{static}", + "%7Battach%7D": "{attach}", + "%7Bfilename%7D": "{filename}", +} +FILE_EXTENSIONS = ["md", "mkd", "mkdn", "mdwn", "mdown", "markdown", "Rmd"] +PANDOC_READER_HTML_TEMPLATE = "pandoc-reader-default.html" +PANDOC_SUPPORTED_MAJOR_VERSION = 2 +PANDOC_SUPPORTED_MINOR_VERSION = 11 + +TEMPLATES_PATH = os.path.abspath(os.path.join(DIR_PATH, "templates")) +UNSUPPORTED_ARGUMENTS = ("--standalone", "--self-contained") +VALID_BIB_EXTENSIONS = ["json", "yaml", "bibtex", "bib"] + +# Markdown variants supported in defaults files +# Update as Pandoc adds or removes support for formats +VALID_INPUT_FORMATS = ( + "commonmark", + "commonmark_x", + "gfm", + "markdown", + "markdown_mmd", + "markdown_phpextra", + "markdown_strict", +) +VALID_OUTPUT_FORMATS 
= ("html", "html5") class PandocReader(BaseReader): + """Convert files written in Pandoc Markdown to HTML 5.""" + enabled = True - file_extensions = ['md', 'markdown', 'mkd', 'mdown'] + file_extensions = FILE_EXTENSIONS - def _get_meta_and_content(self, text): - metadata = {} + def read(self, source_path): + """Parse Pandoc Markdown and return HTML5 markup and metadata.""" + # Get the user-defined path to the Pandoc executable or fall back to default + pandoc_executable = self.settings.get( + "PANDOC_EXECUTABLE_PATH", DEFAULT_PANDOC_EXECUTABLE + ) - use_YAML = text[0] == '---' and yaml is not None - if use_YAML: - # Load the data we need to parse - to_parse = [] - for i, line in enumerate(text[1:]): - # When we find a terminator (`---` or `...`), stop. - if line in ('---', '...'): - # Do not include the terminator itself. - break + # If user-defined path, expand and make it absolute in case the path is relative + if pandoc_executable != DEFAULT_PANDOC_EXECUTABLE: + pandoc_executable = os.path.abspath(os.path.expanduser(pandoc_executable)) - # Otherwise, just keep adding the lines to the parseable. - to_parse.append(line) + # Check if pandoc is installed and is executable + if not shutil.which(pandoc_executable): + raise Exception("Could not find Pandoc. Please install.") - parsed = yaml.load("\n".join(to_parse)) + # Check if the version of pandoc installed is 2.11 or higher + self._check_pandoc_version(pandoc_executable) - # Postprocess to make the data usable by Pelican. - for k in parsed: - name, value = k.lower(), parsed[k] - metadata[name] = self.process_metadata(name, value) + # Open Markdown file and read content + content = "" + with pelican_open(source_path) as file_content: + content = file_content - if (not 'summary' in metadata) or (metadata['summary'] is None): - metadata['summary'] = '' - - # Return the text entirely. 
- content = "\n".join(text) - - else: - for i, line in enumerate(text): - kv = line.split(':', 1) - if len(kv) == 2: - name, value = kv[0].lower(), kv[1].strip() - metadata[name] = self.process_metadata(name, value) - else: - content = "\n".join(text[i:]) - break - - return metadata, content - - def read(self, filename): - with pelican_open(filename) as fp: - text = list(fp.splitlines()) - - metadata, content = self._get_meta_and_content(text) - - filters = self.settings.get('PANDOC_FILTERS', []) - extra_args = self.settings.get('PANDOC_ARGS', []) - extensions = self.settings.get('PANDOC_EXTENSIONS', '') - if isinstance(extensions, list): - extensions = ''.join(extensions) - - pandoc_cmd = ["pandoc", "--from=markdown" + extensions, "--to=html5"] - for filt in filters: - pandoc_cmd.extend(["--filter", filt]) - pandoc_cmd.extend(extra_args) - - bib_dir = self.settings.get('PANDOC_BIBDIR', '') - bib_header = self.settings.get('PANDOC_BIBHEADER', None) - if "bibliography" in metadata.keys(): - bib_file = os.path.join(bib_dir, metadata['bibliography']) - if not os.path.exists(bib_file): - raise FileNotFoundError(bib_file) - bib_args = ['--bibliography={}'.format(bib_file)] - - if bib_header is not None: - bib_args = bib_args + [ - '--metadata=reference-section-title="{}"'.format( - bib_header)] - pandoc_cmd.extend(bib_args) - - if "toc" in metadata.keys(): - if metadata['toc'] == True: - pandoc_cmd.extend(['--toc']) - - proc = subprocess.Popen( - pandoc_cmd, - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - - output, err = proc.communicate(content.encode('utf-8')) - status = proc.wait() - output, err = output.decode('utf-8'), err.decode('utf-8') - if status > 0: - logging.warning(output + err) - - # Make sure we don't lose Pelican template parameters. 
- output = output.replace('%7Battach%7D', '{attach}')\ - .replace('%7Bfilename%7D', '{filename}')\ - .replace('%7Bstatic%7D', '{static}')\ - .replace('%7Btag%7D', '{tag}')\ - .replace('%7Bcategory%7D', '{category}') + # Retrieve HTML content and metadata + output, metadata = self._create_html(source_path, content, pandoc_executable) return output, metadata + def _create_html(self, source_path, content, pandoc_executable): + """Create HTML5 content.""" + # Get settings set in pelicanconf.py + defaults_files = self.settings.get("PANDOC_DEFAULTS_FILES", []) + + # Adding support for the old defaults file setting + # which will be removed in future versions of this plugin + if self.settings.get("PANDOC_DEFAULT_FILES", []): + defaults_files = self.settings.get("PANDOC_DEFAULT_FILES") + + arguments = self.settings.get("PANDOC_ARGS", []) + extensions = self.settings.get("PANDOC_EXTENSIONS", []) + + if isinstance(extensions, list): + extensions = "".join(extensions) + + # Check if source content has a YAML metadata block + self._check_yaml_metadata_block(content) + + # Check validity of arguments or defaults files + table_of_contents, citations = self._validate_fields( + defaults_files, arguments, extensions + ) + + # Construct preliminary pandoc command + pandoc_cmd = self._construct_pandoc_command( + pandoc_executable, defaults_files, arguments, extensions + ) + + # Find and add bibliography if citations are specified + if citations: + for bib_file in self._find_bibs(source_path): + pandoc_cmd.append("--bibliography={0}".format(bib_file)) + + # Create HTML content using pandoc-reader-default.html template + output = self._run_pandoc(pandoc_cmd, content) + + # Extract table of contents, text and metadata from HTML output + output, toc, pandoc_metadata = self._extract_contents(output, table_of_contents) + + # Replace all occurrences of %7Bstatic%7D to {static}, + # %7Battach%7D to {attach} and %7Bfilename%7D to {filename} + # so that static links are resolvable by pelican + 
for encoded_str, raw_str in ENCODED_LINKS_TO_RAW_LINKS_MAP.items(): + output = output.replace(encoded_str, raw_str) + + # Parse Pandoc metadata and add it to Pelican + metadata = self._process_metadata(pandoc_metadata) + + if table_of_contents: + # Create table of contents and add to metadata + metadata["toc"] = self.process_metadata("toc", toc) + + if self.settings.get("CALCULATE_READING_TIME", []): + # Calculate reading time and add to metadata + metadata["reading_time"] = self.process_metadata( + "reading_time", self._calculate_reading_time(content) + ) + + return output, metadata + + def _validate_fields(self, defaults_files, arguments, extensions): + """Validate fields and return citations and ToC request values.""" + # If defaults_files is empty then validate the argument and extensions + if not defaults_files: + # Validate the arguments to see that they are supported + # by the plugin + self._check_arguments(arguments) + + # Check if citations have been requested + citations = self._check_if_citations(arguments, extensions) + + # Check if table of contents has been requested + table_of_contents = self._check_if_toc(arguments) + else: + # Validate defaults files and get the citations + # abd table of contents request value + citations, table_of_contents = self._check_defaults(defaults_files) + return table_of_contents, citations + + def _check_defaults(self, defaults_files): + """Check if the given Pandoc defaults file has valid values.""" + citations = False + table_of_contents = False + + # Get the data in all defaults files as a string + defaults_data = "" + for defaults_file in defaults_files: + with open(defaults_file, "r") as file_handle: + for line in file_handle.readlines(): + defaults_data += line + + # Convert YAML data to a Python dictionary + defaults = {} + try: + yaml = YAML() + defaults = yaml.load(defaults_data) + except constructor.DuplicateKeyError as duplicate_key_error: + raise ValueError( + "Duplicate keys defined in multiple defaults 
files." + ) from duplicate_key_error + + self._check_if_unsupported_settings(defaults) + reader = self._check_input_format(defaults) + self._check_output_format(defaults) + + if not citations: + citeproc_specified = False + + # Cases where citeproc is specified as citeproc: true + if defaults.get("citeproc", ""): + citeproc_specified = True + + # Cases where citeproc is specified in filters + elif "citeproc" in defaults.get("filters", ""): + citeproc_specified = True + + # The extension +citations is enabled by default in Pandoc 2.11 + # we are checking that the extension is not disabled using -citations + if citeproc_specified and "-citations" not in reader: + citations = True + + if not table_of_contents: + if defaults.get("table-of-contents", ""): + table_of_contents = True + + return citations, table_of_contents + + def _calculate_reading_time(self, content): + """Calculate time taken to read content.""" + reading_speed = self.settings.get("READING_SPEED", DEFAULT_READING_SPEED) + wordcount = count_words_in_markdown(content) + + time_unit = "minutes" + try: + reading_time = math.ceil(float(wordcount) / float(reading_speed)) + if reading_time == 1: + time_unit = "minute" + reading_time = "{} {}".format(str(reading_time), time_unit) + except ValueError as words_per_minute_nan: + raise ValueError( + "READING_SPEED setting must be a number." 
+ ) from words_per_minute_nan + + return reading_time + + def _process_metadata(self, pandoc_metadata): + """Process Pandoc metadata and add it to Pelican.""" + # Cycle through the metadata and process them + metadata = {} + for key, value in pandoc_metadata.items(): + key = key.lower() + if value and isinstance(value, str): + value = value.strip().strip('"') + + # Process the metadata + metadata[key] = self.process_metadata(key, value) + return metadata + + @staticmethod + def _check_pandoc_version(pandoc_executable): + """Check that the specified version of Pandoc is 2.11 or higher.""" + output = subprocess.run( + [pandoc_executable, "--version"], + capture_output=True, + encoding="utf-8", + check=True, + ) + + # Returns a string of the form pandoc + pandoc_version = output.stdout.split("\n")[0] + + # Get the major and minor version from the above version string + major_version = pandoc_version.split()[1].split(".")[0] + minor_version = pandoc_version.split()[1].split(".")[1] + + # Pandoc major version less than 2 are not supported + if int(major_version) < PANDOC_SUPPORTED_MAJOR_VERSION: + raise Exception("Pandoc version must be 2.11 or higher.") + + # Pandoc major version 2 minor version less than 11 are not supported + if ( + int(major_version) == PANDOC_SUPPORTED_MAJOR_VERSION + and int(minor_version) < PANDOC_SUPPORTED_MINOR_VERSION + ): + raise Exception("Pandoc version must be 2.11 or higher.") + + @staticmethod + def _check_yaml_metadata_block(content): + """Check if the source content has a YAML metadata block.""" + # Check that the given content is not empty + if not content: + raise Exception("Could not find metadata. 
File is empty.") + + # Split content into a list of lines + content_lines = content.splitlines() + + # Check that the first line of the file starts with a YAML block + if content_lines[0].rstrip() not in ["---"]: + raise Exception("Could not find metadata header '---'.") + + # Find the end of the YAML block + yaml_block_end = "" + for line_num, line in enumerate(content_lines[1:]): + if line.rstrip() in ["---", "..."]: + yaml_block_end = line_num + break + + # Check if the end of the YAML block was found + if not yaml_block_end: + raise Exception("Could not find end of metadata block.") + + @staticmethod + def _construct_pandoc_command( + pandoc_executable, defaults_files, arguments, extensions + ): + """Construct Pandoc command for content.""" + pandoc_cmd = [ + pandoc_executable, + "--standalone", + "--template={}".format( + os.path.join(TEMPLATES_PATH, PANDOC_READER_HTML_TEMPLATE) + ), + ] + if not defaults_files: + pandoc_cmd.extend(["--from", "markdown" + extensions, "--to", "html5"]) + pandoc_cmd.extend(arguments) + else: + for defaults_file in defaults_files: + pandoc_cmd.append("--defaults={0}".format(defaults_file)) + return pandoc_cmd + + @staticmethod + def _run_pandoc(pandoc_cmd, content): + """Execute the given pandoc command and return output.""" + output = subprocess.run( + pandoc_cmd, + input=content, + capture_output=True, + encoding="utf-8", + check=True, + ) + return output.stdout + + @staticmethod + def _extract_contents(html_output, table_of_contents): + """Extract body html, table of contents and metadata from output.""" + # Extract pandoc metadata from html output + pandoc_json_metadata, _, html_output = html_output.partition("\n") + + # Convert JSON string to dict + pandoc_metadata = json.loads(pandoc_json_metadata) + + # Parse HTML output + soup = bs4.BeautifulSoup(html_output, "html.parser") + + # Extract the table of contents if one was requested + toc = "" + if table_of_contents: + # Find the table of contents + toc = 
soup.body.find("nav", id="TOC") + + if toc: + # Convert it to a string + toc = str(toc) + + # Replace id=TOC with class="toc" + toc = toc.replace('id="TOC"', 'class="toc"') + + # Remove the table of contents from the HTML output + soup.body.find("nav", id="TOC").decompose() + + # Remove body tag around html output + soup.body.unwrap() + + # Strip leading and trailing spaces + html_output = str(soup).strip() + + return html_output, toc, pandoc_metadata + + @staticmethod + def _check_if_citations(arguments, extensions): + """Check if citations are specified.""" + citations = False + if arguments and extensions: + # The +citations extension is enabled by default in Pandoc 2.11 + # therefore we do a check to see that it is not disabled in extensions + if ( + "--citeproc" in arguments or "-C" in arguments + ) and "-citations" not in extensions: + citations = True + return citations + + @staticmethod + def _check_if_toc(arguments): + """Check if a table of contents should be generated.""" + table_of_contents = False + if arguments: + if "--toc" in arguments or "--table-of-contents" in arguments: + table_of_contents = True + return table_of_contents + + @staticmethod + def _find_bibs(source_path): + """Find bibliographies recursively in the sourcepath given.""" + bib_files = [] + filename = os.path.splitext(os.path.basename(source_path))[0] + directory_path = os.path.dirname(os.path.abspath(source_path)) + for root, _, files in os.walk(directory_path): + for extension in VALID_BIB_EXTENSIONS: + bib_name = ".".join([filename, extension]) + if bib_name in files: + bib_files.append(os.path.join(root, bib_name)) + return bib_files + + @staticmethod + def _check_arguments(arguments): + """Check to see that only supported arguments have been passed.""" + for arg in arguments: + if arg in UNSUPPORTED_ARGUMENTS: + raise ValueError("Argument {0} is not supported.".format(arg)) + + @staticmethod + def _check_if_unsupported_settings(defaults): + """Check if unsupported settings are 
specified in the defaults.""" + for arg in UNSUPPORTED_ARGUMENTS: + arg = arg[2:] + if defaults.get(arg, ""): + raise ValueError("The default {} should be set to false.".format(arg)) + + @staticmethod + def _check_input_format(defaults): + """Check if the input format given is a Markdown variant.""" + reader = "" + reader_input = defaults.get("reader", "") + from_input = defaults.get("from", "") + + # Case where no input format is specified + if not reader_input and not from_input: + raise ValueError("No input format specified.") + + # Case where both reader and from are specified which is not supported + if reader_input and from_input: + raise ValueError( + ( + "Specifying both from and reader is not supported." + " Please specify just one." + ) + ) + + if reader_input or from_input: + if reader_input: + reader = reader_input + elif from_input: + reader = from_input + + reader_prefix = reader.replace("+", "-").split("-")[0] + + # Check to see if the reader_prefix matches a valid input format + if reader_prefix not in VALID_INPUT_FORMATS: + raise ValueError("Input type has to be a Markdown variant.") + return reader + + @staticmethod + def _check_output_format(defaults): + """Check if the output format is HTML or HTML5.""" + writer_output = defaults.get("writer", "") + to_output = defaults.get("to", "") + + # Case where both writer and to are specified which is not supported + if writer_output and to_output: + raise ValueError( + ( + "Specifying both to and writer is not supported." + " Please specify just one." 
+ ) + ) + + # Case where neither writer nor to value is set to html + if ( + writer_output not in VALID_OUTPUT_FORMATS + and to_output not in VALID_OUTPUT_FORMATS + ): + output_formats = " or ".join(VALID_OUTPUT_FORMATS) + raise ValueError( + "Output format type must be either {}.".format(output_formats) + ) + def add_reader(readers): + """Add the PandocReader as the reader for all Pandoc Markdown files.""" for ext in PandocReader.file_extensions: readers.reader_classes[ext] = PandocReader def register(): + """Register the PandocReader.""" signals.readers_init.connect(add_reader)