# Source code for easydata.parsers.desc

from abc import ABC
from typing import Any, List, Optional, Union

from easytxt import parse_text

from easydata.parsers.base import BaseData

# Names exported by ``from easydata.parsers.desc import *``.
# Only the concrete parser classes are listed; the shared base class
# ``BaseDescription`` is not part of this tuple.
__all__ = (
    "Description",
    "Sentences",
    "Features",
    "FeaturesDict",
    "Feature",
)


class BaseDescription(BaseData, ABC):
    """Common base for parsers that delegate text processing to ``easytxt``.

    The constructor records every text-processing option on the instance and
    ``_get_text_parser`` forwards all of them, unchanged, to
    ``easytxt.parse_text``. The meaning of each individual option is defined
    by ``easytxt``; this class only stores and forwards the values.

    Args:
        *args: Positional arguments passed through to ``BaseData.__init__``.
        language: Explicit language code. When falsy, the ``ED_LANGUAGE``
            config entry is used, falling back to ``"en"``.
        allow, callow, from_allow, from_callow, to_allow, to_callow, deny,
            cdeny: Filtering options forwarded to ``parse_text``.
        normalize, capitalize, title, uppercase, lowercase, min_chars,
            replace_keys, remove_keys, replace_keys_raw_text,
            remove_keys_raw_text, split_inline_breaks, inline_breaks,
            merge_sentences, stop_key, stop_keys_split, stop_keys_ignore,
            sentence_separator, feature_split_keys, text_num_to_numeric,
            autodetect_html, html_text_to_sentences, css_query, exclude_css:
            Further options forwarded to ``parse_text`` under the same names.
        **kwargs: Keyword arguments passed through to ``BaseData.__init__``.
    """

    def __init__(
        self,
        *args,
        language: Optional[str] = None,
        allow: Optional[Union[str, List[str]]] = None,
        callow: Optional[Union[str, List[str]]] = None,
        from_allow: Optional[Union[str, List[str]]] = None,
        from_callow: Optional[Union[str, List[str]]] = None,
        to_allow: Optional[Union[str, List[str]]] = None,
        to_callow: Optional[Union[str, List[str]]] = None,
        deny: Optional[Union[str, List[str]]] = None,
        cdeny: Optional[Union[str, List[str]]] = None,
        normalize: bool = True,
        capitalize: bool = True,
        title: bool = False,
        uppercase: bool = False,
        lowercase: bool = False,
        min_chars: int = 5,
        replace_keys: Optional[list] = None,
        remove_keys: Optional[list] = None,
        replace_keys_raw_text: Optional[list] = None,
        remove_keys_raw_text: Optional[list] = None,
        split_inline_breaks: bool = True,
        inline_breaks: Optional[List[str]] = None,
        merge_sentences: bool = True,
        stop_key: str = ".",
        stop_keys_split: Optional[List[str]] = None,
        stop_keys_ignore: Optional[List[str]] = None,
        sentence_separator: str = " ",
        feature_split_keys: Optional[List[str]] = None,
        text_num_to_numeric: bool = False,
        autodetect_html: bool = True,
        html_text_to_sentences: bool = True,
        css_query: Optional[str] = None,
        exclude_css: Optional[Union[List[str], str]] = None,
        **kwargs,
    ):
        # Options are stored before delegating to the parent initializer,
        # preserving the original initialization order.
        self._allow = allow
        self._callow = callow
        self._from_allow = from_allow
        self._from_callow = from_callow
        self._to_allow = to_allow
        self._to_callow = to_callow
        self._deny = deny
        self._cdeny = cdeny
        self._normalize = normalize
        self._capitalize = capitalize
        self._title = title
        self._uppercase = uppercase
        self._lowercase = lowercase
        self._min_chars = min_chars
        self._replace_keys = replace_keys
        self._remove_keys = remove_keys
        self._replace_keys_raw_text = replace_keys_raw_text
        self._remove_keys_raw_text = remove_keys_raw_text
        self._split_inline_breaks = split_inline_breaks
        self._inline_breaks = inline_breaks
        self._merge_sentences = merge_sentences
        self._stop_key = stop_key
        self._stop_keys_split = stop_keys_split
        self._stop_keys_ignore = stop_keys_ignore
        self._sentence_separator = sentence_separator
        self._feature_split_keys = feature_split_keys
        self._text_num_to_numeric = text_num_to_numeric
        self._autodetect_html = autodetect_html
        self._html_text_to_sentences = html_text_to_sentences
        self._css_query = css_query
        self._exclude_css = exclude_css

        # Double underscore: name-mangled so the ``_language`` property below
        # is the only accessor subclasses see.
        self.__language = language

        super().__init__(
            *args,
            **kwargs,
        )

    @property
    def _language(self):
        """Return the explicit language, else ``ED_LANGUAGE`` config, else "en"."""
        return self.__language or self.config.get("ED_LANGUAGE", "en")

    def _get_text_parser(self, text: Any):
        """Build an ``easytxt`` text parser for *text* using the stored options.

        Args:
            text: Raw value to parse; passed to ``parse_text`` unchanged.

        Returns:
            Whatever ``easytxt.parse_text`` returns for the given options.
        """
        return parse_text(
            text=text,
            language=self._language,
            # ``allow``/``callow`` were previously stored but never forwarded,
            # so they had no effect; pass them like the other filters.
            allow=self._allow,
            callow=self._callow,
            from_allow=self._from_allow,
            from_callow=self._from_callow,
            to_allow=self._to_allow,
            to_callow=self._to_callow,
            deny=self._deny,
            cdeny=self._cdeny,
            normalize=self._normalize,
            capitalize=self._capitalize,
            title=self._title,
            uppercase=self._uppercase,
            lowercase=self._lowercase,
            min_chars=self._min_chars,
            replace_keys=self._replace_keys,
            remove_keys=self._remove_keys,
            replace_keys_raw_text=self._replace_keys_raw_text,
            remove_keys_raw_text=self._remove_keys_raw_text,
            split_inline_breaks=self._split_inline_breaks,
            inline_breaks=self._inline_breaks,
            merge_sentences=self._merge_sentences,
            stop_key=self._stop_key,
            stop_keys_split=self._stop_keys_split,
            stop_keys_ignore=self._stop_keys_ignore,
            sentence_separator=self._sentence_separator,
            feature_split_keys=self._feature_split_keys,
            text_num_to_numeric=self._text_num_to_numeric,
            autodetect_html=self._autodetect_html,
            html_text_to_sentences=self._html_text_to_sentences,
            css_query=self._css_query,
            exclude_css=self._exclude_css,
        )


class Description(BaseDescription):
    """Parser that yields the processed text of a value as a single string."""

    def parse_value(self, value: Any, data: Any) -> Optional[str]:
        """Return the parser's ``text`` for *value*, or ``None`` if empty.

        A falsy *value* short-circuits to ``None`` without invoking the text
        parser; a falsy parsed text is likewise returned as ``None``.
        """
        if value:
            parsed_text = self._get_text_parser(value).text
            return parsed_text or None

        return None
class Sentences(BaseDescription):
    """Parser that yields the ``sentences`` attribute of the text parser."""

    def parse_value(self, value: Any, data: Any) -> Optional[list]:
        """Return the parser's ``sentences`` for *value*; ``None`` if falsy."""
        if value:
            return self._get_text_parser(value).sentences

        return None
class Features(BaseDescription):
    """Parser that yields the ``features`` attribute of the text parser."""

    def parse_value(self, value: Any, data: Any) -> Optional[list]:
        """Return the parser's ``features`` for *value*; ``None`` if falsy."""
        if value:
            text_parser = self._get_text_parser(value)
            return text_parser.features

        return None
class FeaturesDict(BaseDescription):
    """Parser that yields the ``features_dict`` attribute of the text parser."""

    def parse_value(
        self,
        value: Any,
        data: Any,
    ) -> Optional[dict]:
        """Return the parser's ``features_dict`` for *value*.

        Args:
            value: Raw value to parse; a falsy value yields ``None`` without
                invoking the text parser.
            data: Unused by this method.

        Returns:
            The ``features_dict`` attribute of the text parser, or ``None``.
            The annotation was previously ``Optional[list]``, which did not
            match the attribute being returned.
        """
        if not value:
            return None

        return self._get_text_parser(value).features_dict
class Feature(BaseDescription):
    """Parser that looks up a single feature by ``key`` or ``key_exact``."""

    def __init__(
        self,
        *args,
        key: Optional[str] = None,
        key_exact: Optional[str] = None,
        **kwargs,
    ):
        """Store the lookup key(s) and initialize the base parser.

        Raises:
            AttributeError: If both ``key`` and ``key_exact`` are falsy.
        """
        if not (key or key_exact):
            raise AttributeError("feature attr key or key_exact must be provided!")

        self._key = key
        self._key_exact = key_exact

        super().__init__(*args, **kwargs)

    def parse_value(self, value: Any, data: Any) -> Optional[list]:
        """Return the feature selected by the configured key for *value*.

        ``key_exact`` takes precedence: when set, the parser's
        ``feature_exact`` is used; otherwise ``feature`` is called with
        ``key``. A falsy *value* yields ``None``.
        """
        # NOTE(review): the ``Optional[list]`` annotation is kept from the
        # original; confirm against what ``feature``/``feature_exact`` return.
        if not value:
            return None

        parser = self._get_text_parser(value)
        exact_key = self._key_exact

        return parser.feature_exact(exact_key) if exact_key else parser.feature(self._key)