Source code for tokenize_uk.tokenize_uk

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""Ukrainian tokenization script based on `standard tokenization algorithm <https://github.com/lang-uk/ner-uk/blob/master/doc/tokenization.md>`_.

2016 (c) Vsevolod Dyomkin <vseloved@gmail.com>, Dmitry Chaplinsky <chaplinsky.dmitry@gmail.com>
"""

from __future__ import unicode_literals
import re
import six


ACCENT = six.unichr(769)
WORD_TOKENIZATION_RULES = re.compile(r"""
[\w""" + ACCENT + """]+://(?:[a-zA-Z]|[0-9]|[$-_@.&+])+
|[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+
|[0-9]+-[а-яА-ЯіїІЇ'’`""" + ACCENT + """]+
|[+-]?[0-9](?:[0-9,.-]*[0-9])?
|[\w""" + ACCENT + """](?:[\w'’`-""" + ACCENT + """]?[\w""" + ACCENT + """]+)*
|[\w""" + ACCENT + """].(?:\[\w""" + ACCENT + """].)+[\w""" + ACCENT + """]?
|["#$%&*+,/:;<=>@^`~…\\(\\)⟨⟩{}\[\|\]‒–—―«»“”‘’'№]
|[.!?]+
|-+
""", re.X | re.U)

ABBRS = """
ім.
о.
вул.
просп.
бул.
пров.
пл.
г.
р.
див.
п.
с.
м.
""".strip().split()


def tokenize_words(string):
    """
    Tokenize input text to words.

    :param string: Text to tokenize
    :type string: str or unicode
    :return: words
    :rtype: list of strings
    """
    string = six.text_type(string)
    return re.findall(WORD_TOKENIZATION_RULES, string)
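
# A minimal usage sketch (the sample sentence is illustrative, not from
# the original source):
# >>> tokenize_words("Це — тест.")
# ['Це', '—', 'тест', '.']
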
def tokenize_sents(string):
    """
    Tokenize input text to sentences.

    :param string: Text to tokenize
    :type string: str or unicode
    :return: sentences
    :rtype: list of strings
    """
    string = six.text_type(string)

    spans = []
    for match in re.finditer(r'[^\s]+', string):
        spans.append(match)
    spans_count = len(spans)

    rez = []
    off = 0

    for i in range(spans_count):
        tok = string[spans[i].start():spans[i].end()]
        if i == spans_count - 1:
            # Last token: flush the remainder as the final sentence.
            rez.append(string[off:spans[i].end()])
        elif tok[-1] in ['.', '!', '?', '…', '»']:
            # Character right before the first sentence-final mark.
            tok1 = tok[re.search('[.!?…»]', tok).start() - 1]
            next_tok = string[spans[i + 1].start():spans[i + 1].end()]
            # Split only when the next token is capitalized and the current
            # token ends with a period but is neither an initial (uppercase
            # letter before the dot) nor a known abbreviation.
            if (next_tok[0].isupper()
                    and not tok1.isupper()
                    and not (tok[-1] != '.' or tok1[0] == '('
                             or tok in ABBRS)):
                rez.append(string[off:spans[i].end()])
                off = spans[i + 1].start()

    return rez
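
# Illustrative examples (the sample strings are assumptions, not from the
# original source). A period before a capitalized token splits, while a
# known abbreviation such as "вул." does not:
# >>> tokenize_sents("Це перше речення. Це друге речення.")
# ['Це перше речення.', 'Це друге речення.']
# >>> tokenize_sents("Він живе на вул. Хрещатик.")
# ['Він живе на вул. Хрещатик.']
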
def tokenize_text(string):
    """
    Tokenize input text to paragraphs, sentences and words.

    Tokenization into paragraphs is done with a simple newline-splitting
    algorithm; the sentence and word tokenizers above handle the levels
    below that.

    :param string: Text to tokenize
    :type string: str or unicode
    :return: text, tokenized into paragraphs, sentences and words
    :rtype: list of list of list of words
    """
    string = six.text_type(string)
    rez = []
    for part in string.split('\n'):
        par = []
        for sent in tokenize_sents(part):
            par.append(tokenize_words(sent))
        if par:
            rez.append(par)

    return rez
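
# Illustrative sketch (the sample text is an assumption): paragraphs come
# from newline splitting, then sentences, then words.
# >>> tokenize_text("Перший абзац.\nДругий абзац.")
# [[['Перший', 'абзац', '.']], [['Другий', 'абзац', '.']]]
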
__all__ = ["tokenize_words", "tokenize_text", "tokenize_sents"]