Source code for panflute.tools

"""
Useful (but not essential) functions for writing panflute filters
"""


# ---------------------------
# Imports
# ---------------------------

from .base import Element
from .elements import *
from .io import dump

import io
import os
import os.path as p
import re
import sys
import json
import yaml
import shlex
from typing import Tuple

from shutil import which
from subprocess import Popen, PIPE
from functools import partial

# yamlloader keeps dict ordering in yaml
try:
    import yamlloader
except ImportError:
    yamlloader = None

if yamlloader is None:
    # property of pyyaml:
    # C*Loader when compiled with C, else fallback to pure Python loader
    try:
        from yaml import CSafeLoader as Loader
    except ImportError:
        from yaml import SafeLoader as Loader
else:
    from yamlloader.ordereddict import CSafeLoader as Loader

# Path of the pandoc executable. Starts empty and is filled in by
# run_pandoc() the first time it has to look pandoc up with which('pandoc').
PANDOC_PATH = None


# ---------------------------
# Constants
# ---------------------------

# Element types for which stringify() emits a single space
HorizontalSpaces = (Space, LineBreak, SoftBreak)

# Element types for which stringify() emits '\n\n' (when newlines=True)
VerticalSpaces = (Para, )


# ---------------------------
# Convenience classes
# ---------------------------


class PandocVersion:
    '''
    Get runtime Pandoc version

    use PandocVersion().version for comparing versions

    The output of ``pandoc --version`` is fetched on first use and then
    cached on the instance, so pandoc is invoked at most once per instance.
    '''

    def __init__(self):
        pass

    def __str__(self) -> str:
        # First line looks like "pandoc 2.19.2"; keep only the version part
        return self._repr.splitlines()[0].split(' ')[1]

    def __repr__(self) -> str:
        return self._repr

    @property
    def _repr(self) -> str:
        # Lazily call pandoc only once.
        # NOTE: the cache attribute deliberately has a single leading
        # underscore. A double-underscore name would be mangled to
        # `_PandocVersion__...` when assigned, so a `hasattr()` check on the
        # unmangled string would never succeed and pandoc would be
        # re-executed on every access.
        if not hasattr(self, '_cached_repr'):
            self._cached_repr: str = run_pandoc(args=['--version'])
        return self._cached_repr

    @property
    def version(self) -> Tuple[int, ...]:
        '''Version as a tuple of ints, e.g. ``(2, 19, 2)``.'''
        return tuple(int(i) for i in str(self).split('.'))

    @property
    def data_dir(self):
        '''
        List of candidate ``filters`` folders inside pandoc's user data
        directories, as reported by ``pandoc --version``.
        '''
        prefix = "User data directory: "
        rows = [row for row in self._repr.splitlines() if row.startswith(prefix)]
        assert len(rows) == 1, rows
        data_dir = rows[0][len(prefix):]

        # data_dir might contain multiple folders:
        # Default user data directory: /home/runner/.local/share/pandoc or /home/runner/.pandoc/filters
        folders = data_dir.split(' or ')
        return [p.normpath(p.expanduser(p.expandvars(p.join(d, 'filters'))))
                for d in folders]
# Shared module-level instance. Creating it does not run pandoc
# (PandocVersion.__init__ does nothing); pandoc is only called when the
# version information is actually read.
pandoc_version = PandocVersion()


# ---------------------------
# Convenience functions
# ---------------------------
def yaml_filter(element, doc, tag=None, function=None, tags=None,
                strict_yaml=False):
    '''
    Convenience function for parsing code blocks with YAML options

    This function is useful to create a filter that applies to
    code blocks that have specific classes.

    It is used as an argument of ``run_filter``, with two additional options:
    ``tag`` and ``function``.

    Using this is equivalent to having filter functions that:

    1. Check if the element is a code block
    2. Check if the element belongs to a specific class
    3. Split the YAML options (at the beginning of the block, by looking
       for ``...`` or ``---`` strings in a separate line)
    4. Parse the YAML
    5. Use the YAML options and (optionally) the data that follows the YAML
       to return a new or modified element

    Instead, you just need to:

    1. Call ``run_filter`` with ``yaml_filter`` as the action function, and
       with the additional arguments ``tag`` and ``function``
    2. Construct a ``fenced_action`` function that takes four arguments:
       (options, data, element, doc). Note that options is a dict and data
       is a raw string. Notice that this is similar to the ``action``
       functions of standard filters, but with *options* and *data* as the
       new ones.

    Note: if you want to apply multiple functions to separate classes,
    you can use the ``tags`` argument, which receives a dict of
    ``tag: function`` pairs.

    Note: use the ``strict_yaml=True`` option in order to allow for more
    verbose but flexible YAML metadata: more than one YAML blocks are allowed,
    but they all must start with ``---`` (even at the beginning) and end with
    ``---`` or ``...``. Also, YAML is not the default content when no
    delimiters are set.

    Exactly one of ``tag`` and ``tags`` must be given. Returns whatever
    ``function`` returns for the first matching class, or ``None`` when the
    element is not a matching code block or its YAML is malformed.

    Example::

        """
        Replace code blocks of class 'foo' with # horizontal rules
        """

        import panflute as pf

        def fenced_action(options, data, element, doc):
            count = options.get('count', 1)
            div = pf.Div(attributes={'count': str(count)})
            div.content.extend([pf.HorizontalRule] * count)
            return div

        if __name__ == '__main__':
            pf.run_filter(pf.yaml_filter, tag='foo', function=fenced_action)
    '''

    # Allow for either tag+function or a dict {tag: function}
    assert (tag is None) + (tags is None) == 1  # XOR
    if tags is None:
        tags = {tag: function}

    # A line made only of three or more dots or dashes; the capture group
    # makes re.split() keep the delimiters in its output
    delimiter = "^([.]{3,}|[-]{3,})$"
    yaml_errors = (yaml.scanner.ScannerError, yaml.parser.ParserError)

    if type(element) == CodeBlock:
        for tag in tags:
            if tag in element.classes:
                function = tags[tag]

                if not strict_yaml:
                    # Split YAML and data parts (separated by ... or ---).
                    # maxsplit/flags are passed by keyword: passing them
                    # positionally is deprecated since Python 3.13
                    raw = re.split(delimiter, element.text,
                                   maxsplit=1, flags=re.MULTILINE)
                    data = raw[2] if len(raw) > 2 else ''
                    data = data.lstrip('\n')
                    raw = raw[0]
                    try:
                        options = yaml.load(raw, Loader=Loader)  # nosec  # already using SafeLoader
                    except yaml_errors:
                        debug("panflute: malformed YAML block:")
                        debug(repr(raw))
                        return
                    if options is None:
                        options = {}
                else:
                    options = {}
                    data = []
                    raw = re.split(delimiter, element.text,
                                   maxsplit=0, flags=re.MULTILINE)
                    # rawmode=True: we are outside a YAML section
                    rawmode = True

                    for chunk in raw:
                        chunk = chunk.strip('\n')
                        if not chunk:
                            continue
                        if rawmode:
                            if chunk.startswith('---'):
                                rawmode = False
                            else:
                                data.append(chunk)
                        else:
                            if chunk.startswith('---') or chunk.startswith('...'):
                                rawmode = True
                            else:
                                try:
                                    parsed = yaml.load(chunk, Loader=Loader)  # nosec  # already using SafeLoader
                                except yaml_errors:
                                    debug("panflute: malformed YAML block")
                                    return
                                if parsed is None:
                                    # Empty or comment-only YAML section
                                    continue
                                if not isinstance(parsed, dict):
                                    # Options must be a mapping
                                    debug("panflute: malformed YAML block")
                                    return
                                options.update(parsed)

                    data = '\n'.join(data)

                return function(options=options, data=data, element=element,
                                doc=doc)
# --------------------------- # Functions that extract content # ---------------------------
def stringify(element, newlines=True):
    r"""
    Return the raw text version of an element (and its children elements).

    The element is walked and the text contributed by every visited
    element is concatenated in visiting order.

    Example:

        >>> from panflute import *
        >>> e1 = Emph(Str('Hello'), Space, Str('world!'))
        >>> e2 = Strong(Str('Bye!'))
        >>> para = Para(e1, Space, e2)
        >>> stringify(para)
        'Hello world! Bye!\n\n'

    :param newlines: add a new line after a paragraph (default True)
    :type newlines: :class:`bool`
    :rtype: :class:`str`
    """

    pieces = []

    def do_not_descend(node):
        # These elements are rendered as a whole by text_of(), so the
        # walk must not visit their children on its own
        return isinstance(node, (DefinitionList, Cite))

    def text_of(node):
        # Text contributed by a single element (without quote handling)
        if hasattr(node, 'text'):
            return node.text
        if isinstance(node, HorizontalSpaces):
            return ' '
        if isinstance(node, VerticalSpaces) and newlines:
            return '\n\n'
        if type(node) == DefinitionList:
            rows = []
            for item in node.content:
                term = ''.join(stringify(part) for part in item.term)
                definitions = '; '.join(stringify(defn) for defn in item.definitions)
                rows.append(f'- {term}: {definitions}')
            return '\n'.join(rows)
        if type(node) == Cite:
            return stringify(node.content)
        return ''

    def collect(node, doc):
        piece = text_of(node)

        # Add quotes around the contents of Quoted()
        if type(node.parent) == Quoted:
            if node.index == 0:
                piece = '"' + piece
            if node.index == len(node.container) - 1:
                piece += '"'

        pieces.append(piece)

    element.walk(collect, stop_if=do_not_descend)
    return ''.join(pieces)
def _get_metadata(self, key='', default=None, builtin=True):
    """
    get_metadata([key, default, builtin])

    Retrieve metadata with nested keys separated by dots.

    This is useful to avoid repeatedly checking if a dict exists, as
    the frontmatter might not have the keys that we expect.

    With ``builtin=True`` (the default), it will convert the results to
    built-in Python types, instead of :class:`.MetaValue` elements. EG:
    instead of returning a MetaBool it will return True|False.

    :param key: string with the keys separated by a dot (``key1.key2``).
        Default is an empty string (which returns the entire metadata dict)
    :type key: ``str``
    :param default: return value in case the key is not found
        (default is ``None``)
    :param builtin: If True, return built-in Python types
        (default is ``True``)

    :Example:

        >>> doc.metadata['format']['show-frame'] = True
        >>> # ...
        >>> # afterwards:
        >>> show_frame = doc.get_metadata('format.show-frame', False)
        >>> stata_path = doc.get_metadata('media.path.figures', '.')
    """

    assert isinstance(key, str)

    # Start at the root and descend one dotted component at a time
    node = self.metadata
    if key:
        for part in key.split('.'):
            found = isinstance(node, MetaMap) and part in node.content
            if not found:
                return default
            node = node[part]

    # Optionally convert the Meta* element into plain Python values
    if builtin:
        return meta2builtin(node)
    return node


def meta2builtin(meta):
    # Recursively convert a Meta* element into built-in Python types
    if isinstance(meta, MetaBool):
        return meta.boolean
    if isinstance(meta, MetaString):
        return meta.text
    if isinstance(meta, MetaList):
        return [meta2builtin(item) for item in meta.content.list]
    if isinstance(meta, MetaMap):
        return {name: meta2builtin(value)
                for name, value in meta.content.dict.items()}
    if isinstance(meta, (MetaInlines, MetaBlocks)):
        return stringify(meta)

    # Not one of the types handled above: report it and return it unchanged
    debug("MISSING", type(meta))
    return meta


# Bind the method
Doc.get_metadata = _get_metadata


# ---------------------------
# Functions that rely on external calls
# ---------------------------
def shell(args, wait=True, msg=None):
    """
    Execute an external command.

    :param args: command to run, either a list of arguments or a single
        string (which is split with :mod:`shlex`)
    :param wait: if True (default), wait for the command to finish and
        return its stdout; if False, launch it and return immediately
    :param msg: bytes sent to the command's stdin (only used when
        ``wait=True``)
    :return: the raw stdout (``bytes``) when ``wait=True``, else ``None``
    :raises IOError: if ``wait=True`` and the command exits with a
        non-zero code (its stderr is written through ``debug`` first)
    """

    # Fix Windows error if passed a string
    if isinstance(args, str):
        args = shlex.split(args, posix=(os.name != "nt"))
        if os.name == "nt":
            args = [arg.replace('/', '\\') for arg in args]

    if wait:
        proc = Popen(args, stdin=PIPE, stdout=PIPE, stderr=PIPE)
        out, err = proc.communicate(input=msg)
        exitcode = proc.returncode
        if exitcode != 0:
            debug('<<<< shell call failed; error message below >>>>')
            # errors='replace': a non-UTF-8 stderr must not hide the failure
            debug(err.decode('utf-8', errors='replace'))
            debug('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')
            raise IOError(f'shell call failed with exit code {exitcode}')
        return out

    if os.name == "nt":
        DETACHED_PROCESS = 0x00000008
        Popen(args, creationflags=DETACHED_PROCESS)
    else:
        # creationflags is Windows-only; Popen raises ValueError if it is
        # non-zero on POSIX, so just launch the process without waiting
        Popen(args)
def run_pandoc(text='', args=None, pandoc_path=None):
    """
    Low level function that calls Pandoc with (optionally)
    some input text and/or arguments

    :param str text: text sent to Pandoc's stdin (encoded as UTF-8)
    :param list args: command line arguments passed to Pandoc
    :param str pandoc_path: If specified, use the Pandoc at this path.
        If None, default to that from PATH.
    :return: Pandoc's stdout decoded as UTF-8
    :rtype: str
    :raises OSError: if no pandoc executable is found on PATH, if the given
        ``pandoc_path`` does not exist, or if Pandoc exits with a non-zero
        code
    """
    global PANDOC_PATH

    if args is None:
        args = []

    if pandoc_path is None:
        # initialize the global PANDOC_PATH on first use
        if PANDOC_PATH is None:
            found = which('pandoc')
            if found is None:
                raise OSError("Path to pandoc executable does not exists")
            PANDOC_PATH = found
        pandoc_path = PANDOC_PATH

    try:
        proc = Popen([pandoc_path] + args, stdin=PIPE, stdout=PIPE, stderr=PIPE)
    except FileNotFoundError as err:
        raise OSError(f"Given pandoc_path {pandoc_path} is invalid") from err

    out, err = proc.communicate(input=text.encode('utf-8'))
    exitcode = proc.returncode
    if err:
        # errors='replace': undecodable stderr must not mask the real outcome
        debug(err.decode('utf-8', errors='replace'))
    if exitcode != 0:
        # IOError is an alias of OSError; say what actually went wrong
        raise IOError(f'pandoc exited with code {exitcode}')
    return out.decode('utf-8')
def convert_text(text,
                 input_format='markdown',
                 output_format='panflute',
                 standalone=False,
                 extra_args=None,
                 pandoc_path=None):
    r"""
    Convert formatted text (usually markdown) by calling Pandoc internally

    The default output format ('panflute') will return a tree
    of Pandoc elements. When combined with 'standalone=True', the tree root
    will be a 'Doc' element.

    Example:

        >>> from panflute import *
        >>> md = 'Some *markdown* **text** ~xyz~'
        >>> tex = r'Some $x^y$ or $x_n = \sqrt{a + b}$ \textit{a}'
        >>> convert_text(md)
        [Para(Str(Some) Space Emph(Str(markdown)) Space Strong(Str(text)) Space Subscript(Str(xyz)))]
        >>> convert_text(tex)
        [Para(Str(Some) Space Math(x^y; format='InlineMath') Space Str(or) Space Math(x_n = \sqrt{a + b}; format='InlineMath') Space RawInline(\textit{a}; format='tex'))]


    :param text: text that will be converted
    :type text: :class:`str` | :class:`.Element` | :class:`list` of :class:`.Element`
    :param input_format: format of the text (default 'markdown').
     Any Pandoc input format is valid, plus 'panflute' (a tree of Pandoc
     elements)
    :param output_format: format of the output
     (default is 'panflute' which creates the tree of Pandoc elements).
     Non-binary Pandoc formats are allowed (e.g. markdown, latex is allowed,
     but docx and pdf are not).
    :param standalone: whether the results will be a standalone document
     or not.
    :type standalone: :class:`bool`
    :param extra_args: extra arguments passed to Pandoc. The sequence is
     copied, never modified in place.
    :type extra_args: :class:`list`
    :param str pandoc_path: If specified, use the Pandoc at this path.
        If None, default to that from PATH.
    :rtype: :class:`list` | :class:`.Doc` | :class:`str`

    Note: for a more general solution,
    see `pyandoc <https://github.com/kennethreitz/pyandoc/>`_
    by Kenneth Reitz.
    """

    if input_format == 'panflute':
        # Problem:
        #  We need a Doc element, but received a list of elements.
        #  So we wrap-up the list in a Doc, but with what pandoc-api version?
        #  (remember that Pandoc requires a matching api-version!)
        # Workaround: call Pandoc with empty text to get its api-version
        if not isinstance(text, Doc):
            # Probe the same Pandoc that will do the conversion, so the
            # api-version matches even when pandoc_path is given explicitly
            tmp_doc = convert_text('', standalone=True,
                                   pandoc_path=pandoc_path)
            api_version = tmp_doc.api_version
            if isinstance(text, Element):
                text = [text]
            text = Doc(*text, api_version=api_version)

        # Dump the Doc into json
        with io.StringIO() as f:
            dump(text, f)
            text = f.getvalue()

    in_fmt = 'json' if input_format == 'panflute' else input_format
    out_fmt = 'json' if output_format == 'panflute' else output_format

    # Work on a copy so the caller's list is never mutated by the append
    extra_args = [] if extra_args is None else list(extra_args)
    if standalone:
        extra_args.append('--standalone')

    out = inner_convert_text(text, in_fmt, out_fmt, extra_args,
                             pandoc_path=pandoc_path)

    if output_format == 'panflute':
        out = json.loads(out, object_hook=from_json)

        if standalone:
            if not isinstance(out, Doc):  # Pandoc 1.7.2 and earlier
                metadata, items = out
                out = Doc(*items, metadata=metadata)
        else:
            if isinstance(out, Doc):  # Pandoc 1.8 and later
                out = out.content.list
            else:
                out = out[1]  # Pandoc 1.7.2 and earlier

    return out
def inner_convert_text(text, input_format, output_format, extra_args,
                       pandoc_path=None):
    # like convert_text(), but does not support 'panflute' input/output
    pandoc_args = ['--from={}'.format(input_format),
                   '--to={}'.format(output_format)] + extra_args
    raw_output = run_pandoc(text, pandoc_args, pandoc_path=pandoc_path)
    # Replace \r\n with \n
    return "\n".join(raw_output.splitlines())


# ---------------------------
# Functions that modify content
# ---------------------------

def _replace_keyword(self, keyword, replacement, count=0):
    """
    replace_keyword(keyword, replacement[, count])

    Walk through the element and its children
    and look for Str() objects that contains
    exactly the keyword. Then, replace it.

    Usually applied to an entire document (a :class:`.Doc` element)

    Note: If the replacement is a block, it cannot be put in place of
    a Str element. As a solution, the closest ancestor (e.g. the parent)
    will be replaced instead, but only if possible
    (if the parent only has one child).

    Example:

    >>> from panflute import *
    >>> p1 = Para(Str('Spam'), Space, Emph(Str('and'), Space, Str('eggs')))
    >>> p2 = Para(Str('eggs'))
    >>> p3 = Plain(Emph(Str('eggs')))
    >>> doc = Doc(p1, p2, p3)
    >>> doc.content
    ListContainer(Para(Str(Spam) Space Emph(Str(and) Space Str(eggs))) Para(Str(eggs)) Plain(Emph(Str(eggs))))
    >>> doc.replace_keyword('eggs', Str('ham'))
    >>> doc.content
    ListContainer(Para(Str(Spam) Space Emph(Str(and) Space Str(ham))) Para(Str(ham)) Plain(Emph(Str(ham))))
    >>> doc.replace_keyword(keyword='ham', replacement=Para(Str('spam')))
    >>> doc.content
    ListContainer(Para(Str(Spam) Space Emph(Str(and) Space Str(ham))) Para(Str(spam)) Para(Str(spam)))

    :param keyword: string that will be searched (cannot have spaces!)
    :type keyword: :class:`str`
    :param replacement: element that will be placed in turn of the ``Str``
     element that contains the keyword.
    :type replacement: :class:`.Element`
    :param count: number of occurrences that will be replaced.
     If count is not given or is set to zero, all occurrences
     will be replaced.
    :type count: :class:`int`
    """

    def is_keyword(node):
        # True only for a Str whose text is exactly the keyword
        return type(node) == Str and node.text == keyword

    def register_match(root):
        # Count one more match; tell whether it is still within `count`
        root.num_matches += 1
        return not count or root.num_matches <= count

    def replace_with_inline(e, doc):
        if not is_keyword(e):
            return None
        if register_match(doc):
            return replacement
        return None

    def replace_with_block(e, doc):
        '''
        It's difficult to replace a keyword with an entire Block element.

        This is because the keyword is of type Str (an Inline) and the parent
        object of a Str can only contain Inlines and not Blocks
        (e.g. Para can contain Inlines, not Divs)

        Implications:

        1) If the Str that contains the keyword is inside another
        Inline instead of a Block (e.g. Div -> Emph -> Str)
        then we have to do a trick:
        when .walk() touches an Emph that contains Str(keyword),
        it replaces the Emph with Str(keyword).

        2) If the element that contains the Str(keyword) has multiple children,
        then we are in a bind as replacing it will destroy information.
        Thus, we can't do it

        3) If the element that contains the Str(keyword) does so in a DictContainer
        instead of a ListContainer, then we cannot retrieve the "first and only
        element" easily, so we also abort (happens with metadata elements).
        '''

        # Here we can check that e.content is ListContainer (i.e. not DictContainer)
        # or check that e is not a Metavalue ("not isinstance(e, MetaValue)")
        if not hasattr(e, 'content'):
            return None
        children = e.content
        if not isinstance(children, ListContainer) or len(children) != 1:
            return None
        if not is_keyword(children[0]):
            return None

        if isinstance(e, Block):
            if register_match(doc):
                return replacement
            return None
        if isinstance(e, Inline):
            # Collapse the wrapper so an ancestor can match on a later visit
            return Str(keyword)
        return None  # not implemented

    doc = self.doc
    if doc is None:
        raise Exception('No root document')
    doc.num_matches = 0

    if isinstance(replacement, Inline):
        action = replace_with_inline
    elif isinstance(replacement, Block):
        action = replace_with_block
    else:
        raise NotImplementedError(type(replacement))
    return self.walk(action, doc)


# Bind the method
Element.replace_keyword = _replace_keyword
def get_option(options=None, local_tag=None, doc=None, doc_tag=None, default=None, error_on_none=True):
    """
    Fetch an option variable from either a local (element) level option/attribute tag,
    a document level metadata tag, or a default.

    :type options: ``dict``
    :type local_tag: ``str``
    :type doc: :class:`Doc`
    :type doc_tag: ``str``
    :type default: ``any``
    :type error_on_none: ``bool``

    The order of preference is local > document > default,
    although if a local or document tag returns None, then the next level down is used.
    Also, if error_on_none=True and the final variable is None, then a ValueError will be raised

    In this manner you can set global variables, which can be optionally overridden at a local level.
    For example, the two files below show how to apply different styles to docx text:

    **main.md:**

    .. code-block:: none
       :linenos:

       ------------------
       style-div:
         name: MyStyle
       ------------------

       :::style
       some text
       :::

       ::: {.style name=MyOtherStyle}
       some more text
       :::

    **style_filter.py:**

    .. code-block:: python
       :linenos:

       import panflute as pf

       def action(elem, doc):
           if type(elem) == pf.Div:
               style = pf.get_option(elem.attributes, "name", doc, "style-div.name")
               elem.attributes["custom-style"] = style

       def main(doc=None):
           return pf.run_filter(action, doc=doc)

       if __name__ == "__main__":
           main()
    """
    # element level
    has_local = options is not None and local_tag is not None
    if has_local and local_tag in options:
        local_value = options[local_tag]
        if local_value is not None:
            return local_value

    # doc level
    if doc is not None and doc_tag is not None:
        doc_value = doc.get_metadata(doc_tag, None)
        if doc_value is not None:
            return doc_value

    # default level
    if default is None and error_on_none:
        raise ValueError("could not retrieve a value for tag; local={0}, doc={1}".format(local_tag, doc_tag))
    return default