Source code for mwtp.parser

from __future__ import annotations

import re
from typing import TYPE_CHECKING, ClassVar, Literal, Mapping, Sequence, TypedDict

from ._alias_record import AliasRecord, NamespaceAlias
from ._namespace_data import NamespaceData
from ._namespace_id_map import NamespaceIDMap
from ._php_to_upper_map import PHP_TO_UPPER_MAP
from ._title_like import TitleLike
from .exceptions import (
    TitleContainsHTMLEntity,
    TitleContainsIllegalCharacter,
    TitleContainsSignatureComponent,
    TitleContainsURLEncodedCharacter,
    TitleHasRelativePathComponent,
    TitleHasSecondLevelNamespace,
    TitleIsBlank,
    TitleIsTooLong,
    TitleStartsWithColon,
)
from .namespace import Namespace
from .title import Title

if TYPE_CHECKING:
    from typing_extensions import NotRequired


class NamespaceDataFromAPI(TypedDict):
    """
    Shape of a single namespace entry accepted by :class:`Parser`.

    ``Parser.__init__`` takes a mapping from string namespace IDs
    to dictionaries of this shape and unpacks each one into
    a ``NamespaceData`` object.

    NOTE(review): the field names presumably mirror a MediaWiki
    ``siteinfo`` API response -- confirm against the data source.
    """

    # Integer ID of the namespace.
    id: int
    # Casing rule for page names in this namespace; these are the two
    # values ``Parser._apply_casing_rule`` recognizes.
    case: Literal["first-letter", "case-sensitive"]
    # Name of the namespace; registered by the parser as a lookup key.
    name: str
    # Boolean flags carried through as-is; this module never reads them.
    subpages: bool
    content: bool
    nonincludable: bool
    # Optional keys: may be absent from an entry.
    # ``canonical`` is registered as an additional lookup key when set.
    canonical: NotRequired[str]
    namespaceprotection: NotRequired[str]
    defaultcontentmodel: NotRequired[str]


class NamespaceDataFromAPIWithAliases(NamespaceDataFromAPI):
    """
    A :class:`NamespaceDataFromAPI` entry that additionally
    carries a required set of alias names.
    """

    # Alternative names for the namespace.
    aliases: set[str]


class Parser:
    """
    A parser that parses strings into titles using (mostly)
    data provided by the user.
    """

    __slots__ = ("_namespace_data", "_namespace_id_map")

    # Upper bound checked by :meth:`_validate_page_name_length`.
    _TITLE_MAX_BYTES: ClassVar[int] = 255
    # Matches any single character that is never allowed in a page name.
    _ILLEGAL_TITLE_CHARACTER: ClassVar[re.Pattern[str]] = re.compile(
        r"""[\u0000-\u001F#<>[\]{|}\u007F\uFFFD]"""
    )
    _TO_UPPER_MAP: ClassVar[dict[int, int]] = PHP_TO_UPPER_MAP

    _namespace_data: dict[str, NamespaceData]
    _namespace_id_map: NamespaceIDMap

    def __init__(
        self,
        namespace_data: Mapping[str, NamespaceDataFromAPI],
        alias_entries: Sequence[NamespaceAlias],
    ) -> None:
        """
        Construct a new parser object from the given data.

        :param namespace_data:
            A ``Mapping`` that maps string IDs
            to corresponding namespace data.
        :param alias_entries: A ``Sequence`` consisting of alias entries.
        """

        self._namespace_data = {}
        self._namespace_id_map = NamespaceIDMap()

        alias_record = AliasRecord(alias_entries)

        # Order matters: the name map is derived from the data record.
        self._initialize_data_record(namespace_data, alias_record)
        self._initialize_namespace_map()

    def _initialize_data_record(
        self,
        namespace_data: Mapping[str, NamespaceDataFromAPI],
        alias_record: AliasRecord,
    ) -> None:
        """
        Convert all dicts in ``namespace_data``
        to :class:`NamespaceData` objects and store them by ID.

        :param namespace_data: The same data passed to :meth:`__init__`.
        :param alias_record:
            An ``AliasRecord`` constructed from
            :meth:`__init__`'s ``alias_entries``.
        """

        for namespace_id, entry in namespace_data.items():
            aliases = alias_record[namespace_id]

            # Only pass ``aliases`` when there are some, so that
            # ``NamespaceData`` falls back to its own default otherwise.
            if aliases:
                self._namespace_data[namespace_id] = NamespaceData(
                    **entry, aliases=aliases
                )
            else:
                self._namespace_data[namespace_id] = NamespaceData(**entry)

    def _initialize_namespace_map(self) -> None:
        """
        Initialize a namespace-name-(alias)-to-ID map from given data.

        Every namespace is registered under its name, each of its
        aliases and, when set, its canonical name.
        """

        for namespace in self._namespace_data.values():
            keys_to_be_added = [namespace.name]
            keys_to_be_added.extend(namespace.aliases)

            if namespace.canonical:
                keys_to_be_added.append(namespace.canonical)

            for key in keys_to_be_added:
                self._namespace_id_map[key] = namespace.id

    @property
    def namespace_data(self) -> dict[str, NamespaceData]:
        """
        The data given to and sanitized by the parser.
        """

        return self._namespace_data

    def parse(self, string: str) -> Title:
        """
        The main parsing method.

        Raises a subclass of :class:`.InvalidTitle`
        if the string is not a valid title.

        :param string: The string to parse.
        :return: A :class:`Title <.title.Title>`, if parsed successfully.
        """

        title_like = TitleLike(string)
        title_like.sanitize()

        # A single leading colon is tolerated; a second one is
        # rejected later by :meth:`_split_title`.
        if title_like.starts_with(":"):
            title_like.extract(1)

        title_like.remove_fragment_if_any()

        namespace, page_name = self._split_title(title_like)

        self._validate_characters(page_name)
        self._validate_page_name_length(TitleLike(page_name), namespace)

        return self._make_title(page_name, namespace)

    def _make_title(self, page_name: str, namespace: int) -> Title:
        """
        Apply the correct casing rule and construct
        the title object from given data.

        :param page_name: The page name part of the title.
        :param namespace: The namespace of the title.
        :return: The title object.
        """

        # Namespace data is keyed by string IDs.
        corresponding_namespace_data = self._namespace_data[str(namespace)]
        casing_rule = corresponding_namespace_data.case

        cased_page_name = self._apply_casing_rule(page_name, casing_rule)

        return Title(name=cased_page_name, namespace=namespace, parser=self)

    @staticmethod
    def _apply_casing_rule(page_name: str, casing_rule: str) -> str:
        """
        Apply the casing rule to the given page name.

        An empty page name is returned unchanged under either rule.

        :param page_name: The page name to be cased.
        :param casing_rule:
            The casing rule to be applied, either
            ``"case-sensitive"`` or ``"first-letter"``.
        :return: The page name, cased.
        :raises TypeError: If ``casing_rule`` is not recognized.
        """

        if casing_rule == "case-sensitive":
            return page_name

        if casing_rule != "first-letter":
            raise TypeError(f"Case rule unrecognized: {casing_rule}")

        # Nothing to uppercase; avoids an IndexError on ``page_name[0]``.
        if not page_name:
            return page_name

        first_character, the_rest = page_name[0], page_name[1:]

        # Code points listed in the PHP map take precedence
        # over Python's own uppercasing.
        if ord(first_character) in PHP_TO_UPPER_MAP:
            uppercased_first_char = first_character.translate(PHP_TO_UPPER_MAP)
        else:
            uppercased_first_char = first_character.upper()

        return uppercased_first_char + the_rest

    def _split_title(self, title_like: TitleLike) -> tuple[int, str]:
        """
        Split the given title into two parts: namespace and page name.

        :param title_like: The :class:`TitleLike` object to be split.
        :return: A tuple consisting of the namespace and the page name.
        :raises TitleStartsWithColon:
            If the title or the page name starts with a colon.
        :raises TitleIsBlank: If the page name is empty.
        :raises TitleHasSecondLevelNamespace:
            If the title is in the talk namespace and
            its page name starts with another namespace.
        """

        if title_like.starts_with(":"):
            raise TitleStartsWithColon

        namespace_like, page_name_like = title_like.split_by_first_colon()
        page_name = page_name_like

        if namespace_like is not None:
            namespace_id = self._namespace_id_map[namespace_like]

            # The prefix is not a known namespace:
            # the whole string is the page name.
            if namespace_id is None:
                page_name = str(title_like)
        else:
            namespace_id = None

        if page_name == "":
            raise TitleIsBlank

        if page_name.startswith(":"):
            raise TitleStartsWithColon

        if namespace_id is None:
            return int(Namespace.MAIN), page_name

        # Only talk pages whose name contains a colon
        # need the second-level namespace check.
        if namespace_id != Namespace.TALK or ":" not in page_name:
            return namespace_id, page_name

        self._validate_second_level_namespace(page_name)

        return namespace_id, page_name

    def _validate_second_level_namespace(self, page_name: str) -> None:
        """
        Raise an exception if the given page name
        starts with a valid namespace.

        :param page_name: The page name to validate.
        :raises TitleHasSecondLevelNamespace:
            If the part before the first colon is a known namespace.
        """

        title_like = TitleLike(page_name)
        second_level_namespace, _ = title_like.split_by_first_colon()

        if not second_level_namespace:
            return

        if second_level_namespace in self._namespace_id_map:
            raise TitleHasSecondLevelNamespace

    def _validate_characters(self, page_name: str) -> None:
        """
        Check if ``page_name`` contains any illegal characters or components.

        May raise the following exceptions, checked in this order:

        * :class:`TitleContainsIllegalCharacter`
        * :class:`TitleContainsURLEncodedCharacter`
        * :class:`TitleContainsHTMLEntity`
        * :class:`TitleHasRelativePathComponent`
        * :class:`TitleContainsSignatureComponent`

        :param page_name: The page name to validate.
        """

        title_like = TitleLike(page_name)

        if self._ILLEGAL_TITLE_CHARACTER.search(page_name):
            raise TitleContainsIllegalCharacter

        if title_like.contains_url_encoded_character():
            raise TitleContainsURLEncodedCharacter

        if title_like.contains_html_entity_like():
            raise TitleContainsHTMLEntity

        if title_like.has_relative_path_component():
            raise TitleHasRelativePathComponent

        if title_like.contains_signature_component():
            raise TitleContainsSignatureComponent

    def _validate_page_name_length(
        self, title_like: TitleLike, namespace: int
    ) -> None:
        """
        Raise :class:`TitleIsTooLong <.exceptions.TitleIsTooLong>`
        if the title is not in ``Special:`` namespace and
        its length exceeds :attr:`_TITLE_MAX_BYTES`.

        :param title_like: The :class:`TitleLike` object to be checked.
        :param namespace: The namespace of the title.
        """

        not_a_special_page = namespace != Namespace.SPECIAL
        # NOTE(review): presumably ``len()`` of a ``TitleLike`` is its
        # length in bytes, as the constant's name suggests -- confirm.
        exceeds_max_byte_length = len(title_like) > self._TITLE_MAX_BYTES

        if not_a_special_page and exceeds_max_byte_length:
            raise TitleIsTooLong