from __future__ import annotations
import re
from typing import TYPE_CHECKING, ClassVar, Literal, Mapping, Sequence, TypedDict
from ._alias_record import AliasRecord, NamespaceAlias
from ._namespace_data import NamespaceData
from ._namespace_id_map import NamespaceIDMap
from ._php_to_upper_map import PHP_TO_UPPER_MAP
from ._title_like import TitleLike
from .exceptions import (
TitleContainsHTMLEntity,
TitleContainsIllegalCharacter,
TitleContainsSignatureComponent,
TitleContainsURLEncodedCharacter,
TitleHasRelativePathComponent,
TitleHasSecondLevelNamespace,
TitleIsBlank,
TitleIsTooLong,
TitleStartsWithColon,
)
from .namespace import Namespace
from .title import Title
if TYPE_CHECKING:
from typing_extensions import NotRequired
class NamespaceDataFromAPI(TypedDict):
id: int
case: Literal["first-letter", "case-sensitive"]
name: str
subpages: bool
content: bool
nonincludable: bool
canonical: NotRequired[str]
namespaceprotection: NotRequired[str]
defaultcontentmodel: NotRequired[str]
class NamespaceDataFromAPIWithAliases(NamespaceDataFromAPI):
aliases: set[str]
[docs]class Parser:
"""
A parser that parse strings using
(mostly) data provided by the user.
"""
__slots__ = ("_namespace_data", "_namespace_id_map")
_TITLE_MAX_BYTES: ClassVar[int] = 255
_ILLEGAL_TITLE_CHARACTER: ClassVar[re.Pattern[str]] = re.compile(
r"""[\u0000-\u001F#<>[\]{|}\u007F\uFFFD]"""
)
_TO_UPPER_MAP: ClassVar[dict[int, int]] = PHP_TO_UPPER_MAP
_namespace_data: dict[str, NamespaceData]
_namespace_id_map: NamespaceIDMap
[docs] def __init__(
self,
namespace_data: Mapping[str, NamespaceDataFromAPI],
alias_entries: Sequence[NamespaceAlias],
) -> None:
"""
Construct a new parser object from the given data.
:param namespace_data: \
A ``Mapping`` that maps string IDs to corresponding namespace data.
:param alias_entries: A ``Sequence`` consisting of alias entries.
"""
self._namespace_data = {}
self._namespace_id_map = NamespaceIDMap()
alias_record = AliasRecord(alias_entries)
self._initialize_data_record(namespace_data, alias_record)
self._initialize_namespace_map()
def _initialize_data_record(
self,
namespace_data: Mapping[str, NamespaceDataFromAPI],
alias_record: AliasRecord,
) -> None:
"""
Convert all dicts in ``namespace_data`` to
:class:`_dcs.NamespaceData`.
:param namespace_data: The same data passed to :meth:`__init__`.
:param alias_record: \
An AliasRecord constructed
using :meth:`__init__`'s alias_entries.
"""
for namespace_id, entry in namespace_data.items():
aliases = alias_record[namespace_id]
if aliases:
self._namespace_data[namespace_id] = NamespaceData(
**entry, aliases=aliases
)
else:
self._namespace_data[namespace_id] = NamespaceData(**entry)
def _initialize_namespace_map(self) -> None:
"""
Initialize a namespace-name-(alias)-to-ID map from given data.
"""
for namespace in self._namespace_data.values():
keys_to_be_added = [namespace.name]
keys_to_be_added.extend(namespace.aliases)
if namespace.canonical:
keys_to_be_added.append(namespace.canonical)
for key in keys_to_be_added:
self._namespace_id_map[key] = namespace.id
@property
def namespace_data(self) -> dict[str, NamespaceData]:
"""
The data given to and sanitized by the parser.
"""
return self._namespace_data
[docs] def parse(self, string: str) -> Title:
"""
The main parsing method. Raises a subclass of
:class:`.InvalidTitle` if the string is not
a valid title.
:param string: The string to parse.
:return: A :class:`Title <.title.Title>`, if parsed successfully.
"""
title_like = TitleLike(string)
title_like.sanitize()
if title_like.starts_with(":"):
title_like.extract(1)
title_like.remove_fragment_if_any()
namespace, page_name = self._split_title(title_like)
self._validate_characters(page_name)
self._validate_page_name_length(TitleLike(page_name), namespace)
return self._make_title(page_name, namespace)
def _make_title(self, page_name: str, namespace: int) -> Title:
"""
Apply the correct casing rule and construct
the title object from given data.
:param page_name: The page name part of the title.
:param namespace: The namespace of the title.
:return: The title object.
"""
corresponding_namespace_data = self._namespace_data[str(namespace)]
casing_rule = corresponding_namespace_data.case
cased_page_name = self._apply_casing_rule(page_name, casing_rule)
return Title(name=cased_page_name, namespace=namespace, parser=self)
@staticmethod
def _apply_casing_rule(page_name: str, casing_rule: str) -> str:
"""
Apply the casing rule to the given page name.
:param page_name: The page name to be cased.
:param casing_rule: The casing rule to be applied.
:return: The page name, cased.
"""
if casing_rule == "case-sensitive":
cased_page_name = page_name
elif casing_rule == "first-letter":
first_character, the_rest = page_name[0], page_name[1:]
first_character_code = ord(first_character)
if first_character_code not in PHP_TO_UPPER_MAP:
uppercased_first_char = first_character.upper()
else:
uppercased_first_char = first_character.translate(PHP_TO_UPPER_MAP)
cased_page_name = uppercased_first_char + the_rest
else:
raise TypeError(f"Case rule unrecognized: {casing_rule}")
return cased_page_name
def _split_title(self, title_like: TitleLike) -> tuple[int, str]:
"""
Split the given title into two parts: namespace and page name.
:param title_like: The :class:`TitleLike` object to be split.
:return: A tuple consisting of the namespace and the page name.
"""
if title_like.starts_with(":"):
raise TitleStartsWithColon
namespace_like, page_name_like = title_like.split_by_first_colon()
page_name = page_name_like
if namespace_like is not None:
namespace_id = self._namespace_id_map[namespace_like]
if namespace_id is None:
page_name = str(title_like)
else:
namespace_id = None
if page_name == "":
raise TitleIsBlank
if page_name.startswith(":"):
raise TitleStartsWithColon
if namespace_id is None:
return int(Namespace.MAIN), page_name
if namespace_id != Namespace.TALK or ":" not in page_name:
return namespace_id, page_name
self._validate_second_level_namespace(page_name)
return namespace_id, page_name
def _validate_second_level_namespace(self, page_name: str) -> None:
"""
Raise an exception if the given page name
starts with a valid namespace.
:param page_name: The page name to validate.
"""
title_like = TitleLike(page_name)
second_level_namespace, _ = title_like.split_by_first_colon()
if not second_level_namespace:
return
if second_level_namespace in self._namespace_id_map:
raise TitleHasSecondLevelNamespace
def _validate_characters(self, page_name: str) -> None:
"""
Checks if ``page_name`` contains any illegal characters
or components. May raise the following exceptions:
* :class:`TitleContainsIllegalCharacters`
* :class:`TitleContainsURLEncodedCharacters`
* :class:`TitleContainsHTMLEntities`
* :class:`TitleHasRelativePathComponents`
* :class:`TitleContainsSignatureComponents`
:param page_name: The page name to validate.
"""
title_like = TitleLike(page_name)
if self._ILLEGAL_TITLE_CHARACTER.search(page_name):
raise TitleContainsIllegalCharacter
if title_like.contains_url_encoded_character():
raise TitleContainsURLEncodedCharacter
if title_like.contains_html_entity_like():
raise TitleContainsHTMLEntity
if title_like.has_relative_path_component():
raise TitleHasRelativePathComponent
if title_like.contains_signature_component():
raise TitleContainsSignatureComponent
def _validate_page_name_length(self, title_like: TitleLike, namespace: int) -> None:
"""
Raise :class:`TitleIsTooLong <.exceptions.TitleIsTooLong>`
if the title is not in ``Special:`` namespace and
its length exceeds :attr:`_TITLE_MAX_BYTES`.
:param title_like: The :class:`TitleLike` object to be checked.
:param namespace: The namespace of the title.
"""
not_a_special_page = namespace != Namespace.SPECIAL
exceeds_max_byte_length = len(title_like) > self._TITLE_MAX_BYTES
if not_a_special_page and exceeds_max_byte_length:
raise TitleIsTooLong