"""Reader that processes Pandoc Markdown and returns HTML5.""" import json import math import os import shutil import subprocess import bs4 from mwc.counter import count_words_in_markdown from ruamel.yaml import YAML, constructor from pelican import signals from pelican.readers import BaseReader from pelican.utils import pelican_open DEFAULT_READING_SPEED = 200 # Words per minute DEFAULT_PANDOC_EXECUTABLE = "pandoc" DIR_PATH = os.path.dirname(__file__) ENCODED_LINKS_TO_RAW_LINKS_MAP = { "%7Bstatic%7D": "{static}", "%7Battach%7D": "{attach}", "%7Bfilename%7D": "{filename}", } FILE_EXTENSIONS = ["md", "mkd", "mkdn", "mdwn", "mdown", "markdown", "Rmd"] PANDOC_READER_HTML_TEMPLATE = "pandoc-reader-default.html" PANDOC_SUPPORTED_MAJOR_VERSION = 2 PANDOC_SUPPORTED_MINOR_VERSION = 11 TEMPLATES_PATH = os.path.abspath(os.path.join(DIR_PATH, "templates")) UNSUPPORTED_ARGUMENTS = ("--standalone", "--self-contained") VALID_BIB_EXTENSIONS = ["json", "yaml", "bibtex", "bib"] # Markdown variants supported in defaults files # Update as Pandoc adds or removes support for formats VALID_INPUT_FORMATS = ( "commonmark", "commonmark_x", "gfm", "markdown", "markdown_mmd", "markdown_phpextra", "markdown_strict", ) VALID_OUTPUT_FORMATS = ("html", "html5") class PandocReader(BaseReader): """Convert files written in Pandoc Markdown to HTML 5.""" enabled = True file_extensions = FILE_EXTENSIONS def read(self, source_path): """Parse Pandoc Markdown and return HTML5 markup and metadata.""" # Get the user-defined path to the Pandoc executable or fall back to default pandoc_executable = self.settings.get( "PANDOC_EXECUTABLE_PATH", DEFAULT_PANDOC_EXECUTABLE ) # If user-defined path, expand and make it absolute in case the path is relative if pandoc_executable != DEFAULT_PANDOC_EXECUTABLE: pandoc_executable = os.path.abspath(os.path.expanduser(pandoc_executable)) # Check if pandoc is installed and is executable if not shutil.which(pandoc_executable): raise Exception("Could not find Pandoc. Please install.") # Check if the version of pandoc installed is 2.11 or higher self._check_pandoc_version(pandoc_executable) # Open Markdown file and read content content = "" with pelican_open(source_path) as file_content: content = file_content # Retrieve HTML content and metadata output, metadata = self._create_html(source_path, content, pandoc_executable) return output, metadata def _create_html(self, source_path, content, pandoc_executable): """Create HTML5 content.""" # Get settings set in pelicanconf.py defaults_files = self.settings.get("PANDOC_DEFAULTS_FILES", []) # Adding support for the old defaults file setting # which will be removed in future versions of this plugin if self.settings.get("PANDOC_DEFAULT_FILES", []): defaults_files = self.settings.get("PANDOC_DEFAULT_FILES") arguments = self.settings.get("PANDOC_ARGS", []) extensions = self.settings.get("PANDOC_EXTENSIONS", []) if isinstance(extensions, list): extensions = "".join(extensions) # Check if source content has a YAML metadata block self._check_yaml_metadata_block(content) # Check validity of arguments or defaults files table_of_contents, citations = self._validate_fields( defaults_files, arguments, extensions ) # Construct preliminary pandoc command pandoc_cmd = self._construct_pandoc_command( pandoc_executable, defaults_files, arguments, extensions ) # Find and add bibliography if citations are specified if citations: for bib_file in self._find_bibs(source_path): pandoc_cmd.append("--bibliography={0}".format(bib_file)) # Create HTML content using pandoc-reader-default.html template output = self._run_pandoc(pandoc_cmd, content) # Extract table of contents, text and metadata from HTML output output, toc, pandoc_metadata = self._extract_contents(output, table_of_contents) # Replace all occurrences of %7Bstatic%7D to {static}, # %7Battach%7D to {attach} and %7Bfilename%7D to {filename} # so that static links are resolvable by pelican for encoded_str, raw_str in ENCODED_LINKS_TO_RAW_LINKS_MAP.items(): output = output.replace(encoded_str, raw_str) # Parse Pandoc metadata and add it to Pelican metadata = self._process_metadata(pandoc_metadata) if table_of_contents: # Create table of contents and add to metadata metadata["toc"] = self.process_metadata("toc", toc) if self.settings.get("CALCULATE_READING_TIME", []): # Calculate reading time and add to metadata metadata["reading_time"] = self.process_metadata( "reading_time", self._calculate_reading_time(content) ) return output, metadata def _validate_fields(self, defaults_files, arguments, extensions): """Validate fields and return citations and ToC request values.""" # If defaults_files is empty then validate the argument and extensions if not defaults_files: # Validate the arguments to see that they are supported # by the plugin self._check_arguments(arguments) # Check if citations have been requested citations = self._check_if_citations(arguments, extensions) # Check if table of contents has been requested table_of_contents = self._check_if_toc(arguments) else: # Validate defaults files and get the citations # abd table of contents request value citations, table_of_contents = self._check_defaults(defaults_files) return table_of_contents, citations def _check_defaults(self, defaults_files): """Check if the given Pandoc defaults file has valid values.""" citations = False table_of_contents = False # Get the data in all defaults files as a string defaults_data = "" for defaults_file in defaults_files: with open(defaults_file, "r") as file_handle: for line in file_handle.readlines(): defaults_data += line # Convert YAML data to a Python dictionary defaults = {} try: yaml = YAML() defaults = yaml.load(defaults_data) except constructor.DuplicateKeyError as duplicate_key_error: raise ValueError( "Duplicate keys defined in multiple defaults files." ) from duplicate_key_error self._check_if_unsupported_settings(defaults) reader = self._check_input_format(defaults) self._check_output_format(defaults) if not citations: citeproc_specified = False # Cases where citeproc is specified as citeproc: true if defaults.get("citeproc", ""): citeproc_specified = True # Cases where citeproc is specified in filters elif "citeproc" in defaults.get("filters", ""): citeproc_specified = True # The extension +citations is enabled by default in Pandoc 2.11 # we are checking that the extension is not disabled using -citations if citeproc_specified and "-citations" not in reader: citations = True if not table_of_contents: if defaults.get("table-of-contents", ""): table_of_contents = True return citations, table_of_contents def _calculate_reading_time(self, content): """Calculate time taken to read content.""" reading_speed = self.settings.get("READING_SPEED", DEFAULT_READING_SPEED) wordcount = count_words_in_markdown(content) time_unit = "minutes" try: reading_time = math.ceil(float(wordcount) / float(reading_speed)) if reading_time == 1: time_unit = "minute" reading_time = "{} {}".format(str(reading_time), time_unit) except ValueError as words_per_minute_nan: raise ValueError( "READING_SPEED setting must be a number." ) from words_per_minute_nan return reading_time def _process_metadata(self, pandoc_metadata): """Process Pandoc metadata and add it to Pelican.""" # Cycle through the metadata and process them metadata = {} for key, value in pandoc_metadata.items(): key = key.lower() if value and isinstance(value, str): value = value.strip().strip('"') # Process the metadata metadata[key] = self.process_metadata(key, value) return metadata @staticmethod def _check_pandoc_version(pandoc_executable): """Check that the specified version of Pandoc is 2.11 or higher.""" output = subprocess.run( [pandoc_executable, "--version"], capture_output=True, encoding="utf-8", check=True, ) # Returns a string of the form pandoc pandoc_version = output.stdout.split("\n")[0] # Get the major and minor version from the above version string major_version = pandoc_version.split()[1].split(".")[0] minor_version = pandoc_version.split()[1].split(".")[1] # Pandoc major version less than 2 are not supported if int(major_version) < PANDOC_SUPPORTED_MAJOR_VERSION: raise Exception("Pandoc version must be 2.11 or higher.") # Pandoc major version 2 minor version less than 11 are not supported if ( int(major_version) == PANDOC_SUPPORTED_MAJOR_VERSION and int(minor_version) < PANDOC_SUPPORTED_MINOR_VERSION ): raise Exception("Pandoc version must be 2.11 or higher.") @staticmethod def _check_yaml_metadata_block(content): """Check if the source content has a YAML metadata block.""" # Check that the given content is not empty if not content: raise Exception("Could not find metadata. File is empty.") # Split content into a list of lines content_lines = content.splitlines() # Check that the first line of the file starts with a YAML block if content_lines[0].rstrip() not in ["---"]: raise Exception("Could not find metadata header '---'.") # Find the end of the YAML block yaml_block_end = "" for line_num, line in enumerate(content_lines[1:]): if line.rstrip() in ["---", "..."]: yaml_block_end = line_num break # Check if the end of the YAML block was found if not yaml_block_end: raise Exception("Could not find end of metadata block.") @staticmethod def _construct_pandoc_command( pandoc_executable, defaults_files, arguments, extensions ): """Construct Pandoc command for content.""" pandoc_cmd = [ pandoc_executable, "--standalone", "--template={}".format( os.path.join(TEMPLATES_PATH, PANDOC_READER_HTML_TEMPLATE) ), ] if not defaults_files: pandoc_cmd.extend(["--from", "markdown" + extensions, "--to", "html5"]) pandoc_cmd.extend(arguments) else: for defaults_file in defaults_files: pandoc_cmd.append("--defaults={0}".format(defaults_file)) return pandoc_cmd @staticmethod def _run_pandoc(pandoc_cmd, content): """Execute the given pandoc command and return output.""" output = subprocess.run( pandoc_cmd, input=content, capture_output=True, encoding="utf-8", check=True, ) return output.stdout @staticmethod def _extract_contents(html_output, table_of_contents): """Extract body html, table of contents and metadata from output.""" # Extract pandoc metadata from html output pandoc_json_metadata, _, html_output = html_output.partition("\n") # Convert JSON string to dict pandoc_metadata = json.loads(pandoc_json_metadata) # Parse HTML output soup = bs4.BeautifulSoup(html_output, "html.parser") # Extract the table of contents if one was requested toc = "" if table_of_contents: # Find the table of contents toc = soup.body.find("nav", id="TOC") if toc: # Convert it to a string toc = str(toc) # Replace id=TOC with class="toc" toc = toc.replace('id="TOC"', 'class="toc"') # Remove the table of contents from the HTML output soup.body.find("nav", id="TOC").decompose() # Remove body tag around html output soup.body.unwrap() # Strip leading and trailing spaces html_output = str(soup).strip() return html_output, toc, pandoc_metadata @staticmethod def _check_if_citations(arguments, extensions): """Check if citations are specified.""" citations = False if arguments and extensions: # The +citations extension is enabled by default in Pandoc 2.11 # therefore we do a check to see that it is not disabled in extensions if ( "--citeproc" in arguments or "-C" in arguments ) and "-citations" not in extensions: citations = True return citations @staticmethod def _check_if_toc(arguments): """Check if a table of contents should be generated.""" table_of_contents = False if arguments: if "--toc" in arguments or "--table-of-contents" in arguments: table_of_contents = True return table_of_contents @staticmethod def _find_bibs(source_path): """Find bibliographies recursively in the sourcepath given.""" bib_files = [] filename = os.path.splitext(os.path.basename(source_path))[0] directory_path = os.path.dirname(os.path.abspath(source_path)) for root, _, files in os.walk(directory_path): for extension in VALID_BIB_EXTENSIONS: bib_name = ".".join([filename, extension]) if bib_name in files: bib_files.append(os.path.join(root, bib_name)) return bib_files @staticmethod def _check_arguments(arguments): """Check to see that only supported arguments have been passed.""" for arg in arguments: if arg in UNSUPPORTED_ARGUMENTS: raise ValueError("Argument {0} is not supported.".format(arg)) @staticmethod def _check_if_unsupported_settings(defaults): """Check if unsupported settings are specified in the defaults.""" for arg in UNSUPPORTED_ARGUMENTS: arg = arg[2:] if defaults.get(arg, ""): raise ValueError("The default {} should be set to false.".format(arg)) @staticmethod def _check_input_format(defaults): """Check if the input format given is a Markdown variant.""" reader = "" reader_input = defaults.get("reader", "") from_input = defaults.get("from", "") # Case where no input format is specified if not reader_input and not from_input: raise ValueError("No input format specified.") # Case where both reader and from are specified which is not supported if reader_input and from_input: raise ValueError( ( "Specifying both from and reader is not supported." " Please specify just one." ) ) if reader_input or from_input: if reader_input: reader = reader_input elif from_input: reader = from_input reader_prefix = reader.replace("+", "-").split("-")[0] # Check to see if the reader_prefix matches a valid input format if reader_prefix not in VALID_INPUT_FORMATS: raise ValueError("Input type has to be a Markdown variant.") return reader @staticmethod def _check_output_format(defaults): """Check if the output format is HTML or HTML5.""" writer_output = defaults.get("writer", "") to_output = defaults.get("to", "") # Case where both writer and to are specified which is not supported if writer_output and to_output: raise ValueError( ( "Specifying both to and writer is not supported." " Please specify just one." ) ) # Case where neither writer nor to value is set to html if ( writer_output not in VALID_OUTPUT_FORMATS and to_output not in VALID_OUTPUT_FORMATS ): output_formats = " or ".join(VALID_OUTPUT_FORMATS) raise ValueError( "Output format type must be either {}.".format(output_formats) ) def add_reader(readers): """Add the PandocReader as the reader for all Pandoc Markdown files.""" for ext in PandocReader.file_extensions: readers.reader_classes[ext] = PandocReader def register(): """Register the PandocReader.""" signals.readers_init.connect(add_reader)