"""Convert the manual's markdown sources to DocBook.

Handles the manual-specific ``include`` directives (structural fragment
includes and options-block includes) on top of the generic markdown
conversion provided by ``md.Converter``.
"""

import argparse
import json

from abc import abstractmethod
from collections.abc import Mapping, Sequence
from pathlib import Path
from typing import Any, cast, ClassVar, Generic, get_args, NamedTuple, Optional, Union
from xml.sax.saxutils import escape, quoteattr

import markdown_it
from markdown_it.token import Token

from . import md, options
from .docbook import DocBookRenderer, Heading
from .manual_structure import check_structure, FragmentType, is_include, TocEntryType
from .md import Converter


class BaseConverter(Converter[md.TR], Generic[md.TR]):
    """Shared machinery for manual converters: resolves ``include`` blocks
    (structural fragments and options listings) while parsing."""

    # per-converter configuration for ns:arg=value arguments to include blocks, following
    # the include type. html converters need something like this to support chunking, or
    # another external method like the chunktocs docbook uses (but block options seem like
    # a much nicer way of doing this).
    INCLUDE_ARGS_NS: ClassVar[str]
    INCLUDE_FRAGMENT_ALLOWED_ARGS: ClassVar[set[str]] = set()
    INCLUDE_OPTIONS_ALLOWED_ARGS: ClassVar[set[str]] = set()

    # stack of files currently being processed, used for cycle detection and
    # for resolving relative include paths against the including file
    _base_paths: list[Path]
    # stack of structural types (book/part/chapter/...) we are nested inside
    _current_type: list[TocEntryType]

    def convert(self, file: Path) -> str:
        """Render *file* (the manual's top-level book) and return the result.

        Raises RuntimeError (chaining the original error) on any failure.
        """
        self._base_paths = [ file ]
        self._current_type = ['book']
        try:
            with open(file, 'r') as f:
                return self._render(f.read())
        except Exception as e:
            raise RuntimeError(f"failed to render manual {file}") from e

    def _parse(self, src: str) -> list[Token]:
        """Parse *src*, verify its structure, and expand include directives.

        Include tokens are retyped to ``included_*`` and get their resolved
        content attached to ``token.meta`` for the renderer to pick up.
        """
        tokens = super()._parse(src)
        check_structure(self._current_type[-1], tokens)
        for token in tokens:
            if not is_include(token):
                continue
            # the include info line is "```{=include=} <type> [ns:arg=value ...]"
            directive = token.info[12:].split()
            if not directive:
                continue
            args = { k: v for k, _sep, v in map(lambda s: s.partition('='), directive[1:]) }
            typ = directive[0]
            if typ == 'options':
                token.type = 'included_options'
                self._process_include_args(token, args, self.INCLUDE_OPTIONS_ALLOWED_ARGS)
                self._parse_options(token, args)
            else:
                # structural includes use the plural form ("chapters"), the
                # fragment type is the singular
                fragment_type = typ.removesuffix('s')
                if fragment_type not in get_args(FragmentType):
                    raise RuntimeError(f"unsupported structural include type '{typ}'")
                self._current_type.append(cast(FragmentType, fragment_type))
                token.type = 'included_' + typ
                self._process_include_args(token, args, self.INCLUDE_FRAGMENT_ALLOWED_ARGS)
                self._parse_included_blocks(token, args)
                self._current_type.pop()
        return tokens

    def _process_include_args(self, token: Token, args: dict[str, str], allowed: set[str]) -> None:
        """Filter include args to this converter's namespace and validate them.

        The namespaced args (minus the ``ns:`` prefix) are stored in
        ``token.meta['include-args']``; unknown args raise RuntimeError.
        """
        ns = self.INCLUDE_ARGS_NS + ":"
        args = { k[len(ns):]: v for k, v in args.items() if k.startswith(ns) }
        if unknown := set(args.keys()) - allowed:
            assert token.map
            raise RuntimeError(f"unrecognized include argument in line {token.map[0] + 1}", unknown)
        token.meta['include-args'] = args

    def _parse_included_blocks(self, token: Token, block_args: dict[str, str]) -> None:
        """Parse each file listed in a structural include block.

        Stores a list of ``(tokens, path)`` pairs in ``token.meta['included']``.
        Detects include cycles via the ``_base_paths`` stack.
        """
        assert token.map
        included = token.meta['included'] = []
        # token.map[0] is the fence line; content starts on the next line,
        # hence the +2 to produce 1-based line numbers for error messages
        for (lnum, line) in enumerate(token.content.splitlines(), token.map[0] + 2):
            line = line.strip()
            path = self._base_paths[-1].parent / line
            if path in self._base_paths:
                raise RuntimeError(f"circular include found in line {lnum}")
            try:
                self._base_paths.append(path)
                with open(path, 'r') as f:
                    tokens = self._parse(f.read())
                    included.append((tokens, path))
                self._base_paths.pop()
            except Exception as e:
                raise RuntimeError(f"processing included file {path} from line {lnum}") from e

    def _parse_options(self, token: Token, block_args: dict[str, str]) -> None:
        """Parse an options include block of ``key: value`` directives.

        Requires exactly the directives ``id-prefix``, ``list-id`` and
        ``source``; loads the JSON options *source* file and stashes
        everything in ``token.meta`` for the renderer.
        """
        assert token.map

        items = {}
        for (lnum, line) in enumerate(token.content.splitlines(), token.map[0] + 2):
            if len(args := line.split(":", 1)) != 2:
                raise RuntimeError(f"options directive with no argument in line {lnum}")
            (k, v) = (args[0].strip(), args[1].strip())
            if k in items:
                raise RuntimeError(f"duplicate options directive {k} in line {lnum}")
            items[k] = v
        try:
            id_prefix = items.pop('id-prefix')
            varlist_id = items.pop('list-id')
            source = items.pop('source')
        except KeyError as e:
            raise RuntimeError(f"options directive {e} missing in block at line {token.map[0] + 1}")
        if items.keys():
            raise RuntimeError(
                f"unsupported options directives in block at line {token.map[0] + 1}",
                " ".join(items.keys()))

        try:
            with open(self._base_paths[-1].parent / source, 'r') as f:
                token.meta['id-prefix'] = id_prefix
                token.meta['list-id'] = varlist_id
                token.meta['source'] = json.load(f)
        except Exception as e:
            raise RuntimeError(f"processing options block in line {token.map[0] + 1}") from e


class ManualDocBookRenderer(DocBookRenderer):
    """DocBook renderer that additionally handles ``included_*`` tokens
    produced by BaseConverter and the special two-title book layout."""

    _toplevel_tag: str
    _revision: str

    def __init__(self, toplevel_tag: str, revision: str, manpage_urls: Mapping[str, str]):
        super().__init__(manpage_urls)
        self._toplevel_tag = toplevel_tag
        self._revision = revision
        self.rules |= {
            'included_sections': lambda *args: self._included_thing("section", *args),
            'included_chapters': lambda *args: self._included_thing("chapter", *args),
            'included_preface': lambda *args: self._included_thing("preface", *args),
            'included_parts': lambda *args: self._included_thing("part", *args),
            'included_appendix': lambda *args: self._included_thing("appendix", *args),
            'included_options': self.included_options,
        }

    def render(self, tokens: Sequence[Token]) -> str:
        # books get special handling because they have *two* title tags. doing this with
        # generic code is more complicated than it's worth. the checks above have verified
        # that both titles actually exist.
        if self._toplevel_tag == 'book':
            assert tokens[1].children
            assert tokens[4].children
            if (maybe_id := cast(str, tokens[0].attrs.get('id', ""))):
                maybe_id = "xml:id=" + quoteattr(maybe_id)
            # NOTE(review): the book element markup had been stripped from this
            # file; restored from the renderer's contract (DocBook 5 namespaces,
            # title + subtitle, body after the heading tokens).
            return (f'<book xmlns="http://docbook.org/ns/docbook"'
                    f'      xmlns:xlink="http://www.w3.org/1999/xlink"'
                    f'      {maybe_id}>'
                    f'  <title>{self.renderInline(tokens[1].children)}</title>'
                    f'  <subtitle>{self.renderInline(tokens[4].children)}</subtitle>'
                    f'  {super().render(tokens[6:])}'
                    f'</book>')
        return super().render(tokens)

    def _heading_tag(self, token: Token, tokens: Sequence[Token], i: int) -> tuple[str, dict[str, str]]:
        (tag, attrs) = super()._heading_tag(token, tokens, i)
        # render() has already verified that we don't have supernumerary headings and since the
        # book tag is handled specially we can leave the check this simple
        if token.tag != 'h1':
            return (tag, attrs)
        return (self._toplevel_tag, attrs | {
            'xmlns': "http://docbook.org/ns/docbook",
            'xmlns:xlink': "http://www.w3.org/1999/xlink",
        })

    def _included_thing(self, tag: str, token: Token, tokens: Sequence[Token], i: int) -> str:
        """Render a structural include by rendering each included file inside
        its own fresh renderer rooted at *tag*."""
        result = []
        # close existing partintro. the generic render doesn't really need this because
        # it doesn't have a concept of structure in the way the manual does.
        if self._headings and self._headings[-1] == Heading('part', 1):
            result.append("</partintro>")
            self._headings[-1] = self._headings[-1]._replace(partintro_closed=True)
        # must nest properly for structural includes. this requires saving at least
        # the headings stack, but creating new renderers is cheap and much easier.
        r = ManualDocBookRenderer(tag, self._revision, self._manpage_urls)
        for (included, path) in token.meta['included']:
            try:
                result.append(r.render(included))
            except Exception as e:
                raise RuntimeError(f"rendering {path}") from e
        return "".join(result)

    def included_options(self, token: Token, tokens: Sequence[Token], i: int) -> str:
        """Render an options include via the options DocBook converter."""
        conv = options.DocBookConverter(self._manpage_urls, self._revision, False, 'fragment',
                                        token.meta['list-id'], token.meta['id-prefix'])
        conv.add_options(token.meta['source'])
        return conv.finalize(fragment=True)

    # TODO minimize docbook diffs with existing conversions. remove soon.
    def paragraph_open(self, token: Token, tokens: Sequence[Token], i: int) -> str:
        return super().paragraph_open(token, tokens, i) + "\n "

    def paragraph_close(self, token: Token, tokens: Sequence[Token], i: int) -> str:
        return "\n" + super().paragraph_close(token, tokens, i)

    def code_block(self, token: Token, tokens: Sequence[Token], i: int) -> str:
        # NOTE(review): the programlisting tags had been stripped; restored so
        # the renderer emits valid DocBook rather than bare escaped text.
        return f"<programlisting>\n{escape(token.content)}</programlisting>"

    def fence(self, token: Token, tokens: Sequence[Token], i: int) -> str:
        # the computed language attribute was previously dead — the tag that
        # consumes it had been stripped from the return value.
        info = f" language={quoteattr(token.info)}" if token.info != "" else ""
        return f"<programlisting{info}>\n{escape(token.content)}</programlisting>"


class DocBookConverter(BaseConverter[ManualDocBookRenderer]):
    """Concrete manual converter producing a DocBook book."""

    INCLUDE_ARGS_NS = "docbook"

    def __init__(self, manpage_urls: Mapping[str, str], revision: str):
        super().__init__()
        self._renderer = ManualDocBookRenderer('book', revision, manpage_urls)


def _build_cli_db(p: argparse.ArgumentParser) -> None:
    """Register the docbook subcommand's arguments."""
    p.add_argument('--manpage-urls', required=True)
    p.add_argument('--revision', required=True)
    p.add_argument('infile', type=Path)
    p.add_argument('outfile', type=Path)


def _run_cli_db(args: argparse.Namespace) -> None:
    """Run the docbook conversion: read manpage urls, convert, write output."""
    with open(args.manpage_urls, 'r') as manpage_urls:
        md = DocBookConverter(json.load(manpage_urls), args.revision)
        converted = md.convert(args.infile)
        args.outfile.write_text(converted)


def build_cli(p: argparse.ArgumentParser) -> None:
    """Attach the per-format subparsers to *p*."""
    formats = p.add_subparsers(dest='format', required=True)
    _build_cli_db(formats.add_parser('docbook'))


def run_cli(args: argparse.Namespace) -> None:
    """Dispatch to the handler for the selected output format."""
    if args.format == 'docbook':
        _run_cli_db(args)
    else:
        raise RuntimeError('format not hooked up', args)