Source code for tokenize_uk.tokenize_uk
#!env python
# -*- coding: utf-8 -*-
"""Ukrainian tokenization script based on `standard tokenization algorithm <https://github.com/lang-uk/ner-uk/blob/master/doc/tokenization.md>`_.
2016 (c) Vsevolod Dyomkin <vseloved@gmail.com>, Dmitry Chaplinsky <chaplinsky.dmitry@gmail.com>
"""
from __future__ import unicode_literals
import re
import six
ACCENT = six.unichr(769)
WORD_TOKENIZATION_RULES = re.compile(r"""
[\w""" + ACCENT + """]+://(?:[a-zA-Z]|[0-9]|[$-_@.&+])+
|[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+.[a-zA-Z0-9-.]+
|[0-9]+-[а-яА-ЯіїІЇ'’`""" + ACCENT + """]+
|[+-]?[0-9](?:[0-9,.-]*[0-9])?
|[\w""" + ACCENT + """](?:[\w'’`-""" + ACCENT + """]?[\w""" + ACCENT + """]+)*
|[\w""" + ACCENT + """].(?:\[\w""" + ACCENT + """].)+[\w""" + ACCENT + """]?
|["#$%&*+,/:;<=>@^`~…\\(\\)⟨⟩{}\[\|\]‒–—―«»“”‘’'№]
|[.!?]+
|-+
""", re.X | re.U)
ABBRS = """
ім.
о.
вул.
просп.
бул.
пров.
пл.
г.
р.
див.
п.
с.
м.
""".strip().split()
[docs]def tokenize_words(string):
"""
Tokenize input text to words.
:param string: Text to tokenize
:type string: str or unicode
:return: words
:rtype: list of strings
"""
string = six.text_type(string)
return re.findall(WORD_TOKENIZATION_RULES, string)
[docs]def tokenize_sents(string):
"""
Tokenize input text to sentences.
:param string: Text to tokenize
:type string: str or unicode
:return: sentences
:rtype: list of strings
"""
string = six.text_type(string)
spans = []
for match in re.finditer('[^\s]+', string):
spans.append(match)
spans_count = len(spans)
rez = []
off = 0
for i in range(spans_count):
tok = string[spans[i].start():spans[i].end()]
if i == spans_count - 1:
rez.append(string[off:spans[i].end()])
elif tok[-1] in ['.', '!', '?', '…', '»']:
tok1 = tok[re.search('[.!?…»]', tok).start()-1]
next_tok = string[spans[i + 1].start():spans[i + 1].end()]
if (next_tok[0].isupper()
and not tok1.isupper()
and not (tok[-1] != '.'
or tok1[0] == '('
or tok in ABBRS)):
rez.append(string[off:spans[i].end()])
off = spans[i + 1].start()
return rez
[docs]def tokenize_text(string):
"""
Tokenize input text to paragraphs, sentences and words.
Tokenization to paragraphs is done using simple Newline algorithm
For sentences and words tokenizers above are used
:param string: Text to tokenize
:type string: str or unicode
:return: text, tokenized into paragraphs, sentences and words
:rtype: list of list of list of words
"""
string = six.text_type(string)
rez = []
for part in string.split('\n'):
par = []
for sent in tokenize_sents(part):
par.append(tokenize_words(sent))
if par:
rez.append(par)
return rez
__all__ = [
"tokenize_words", "tokenize_text", "tokenize_sents"]