spaCy/spacy/orth.pyx

# -*- coding: utf8 -*-
from __future__ import unicode_literals
import unicodedata

# If your license is not GPL compatible, use text_unidecode. But if your code
# is, you should use the unidecode library, because its performance is better.
# spaCy does not list unidecode as a dependency, in case your license is not
# GPL compatible.
try:
    from unidecode import unidecode
except ImportError:
    from text_unidecode import unidecode


import re

import math


# Upper-cased tag names as a list of strings (presumably the coarse
# part-of-speech tag set -- confirm against the tagger that consumes it).
TAGS = 'adj adp adv conj det noun num pdt pos pron prt punct verb'.upper().split()


# Binary string features
cpdef bint is_alpha(unicode string):
    """Return True if `string` is non-empty and every character is alphabetic.

    Thin wrapper around `unicode.isalpha`.
    """
    return string.isalpha()


cpdef bint is_digit(unicode string):
    """Return True if `string` is non-empty and every character is a digit.

    Thin wrapper around `unicode.isdigit`.
    """
    return string.isdigit()


cpdef bint is_punct(unicode string):
    """Return True if every character of `string` is punctuation.

    A character counts as punctuation when its Unicode general category
    starts with 'P'. The empty string has no offending character, so it
    yields True.
    """
    for ch in string:
        if unicodedata.category(ch).startswith('P'):
            continue
        # First non-punctuation character settles the answer.
        return False
    return True


cpdef bint is_space(unicode string):
    """Return True if `string` is non-empty and consists only of whitespace.

    Thin wrapper around `unicode.isspace`.
    """
    return string.isspace()


cpdef bint is_ascii(unicode string):
    """Return True if every character of `string` has a code point below 128.

    The empty string yields True.
    """
    for ch in string:
        if ord(ch) < 128:
            continue
        # Found a character outside the 7-bit ASCII range.
        return False
    return True


cpdef bint is_title(unicode string):
    """Return True if `string` is title-cased, as judged by `unicode.istitle`."""
    return string.istitle()


cpdef bint is_lower(unicode string):
    """Return True if `string` is lower-cased, as judged by `unicode.islower`.

    That is: it contains at least one cased character and no upper-case ones.
    """
    return string.islower()


cpdef bint is_upper(unicode string):
    """Return True if `string` is upper-cased, as judged by `unicode.isupper`.

    That is: it contains at least one cased character and no lower-case ones.
    """
    return string.isupper()


# Lower-case top-level-domain suffixes, built by splitting one '|'-separated
# string. like_url() tests the text after the last '.' for membership here.
TLDs = set("com|org|edu|gov|net|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|"
        "name|pro|tel|travel|xxx|"
        "ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|"
        "bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|"
        "co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|"
        "fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|"
        "hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|"
        "km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|"
        "mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|"
        "nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|"
        "sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|ss|st|su|sv|sy|sz|tc|td|tf|tg|th|"
        "tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|"
        "wf|ws|ye|yt|za|zm|zw".split('|'))


cpdef bint like_url(unicode string):
    """Return True if `string` functions like a URL in running text.

    The check is a heuristic, not a validator:

    * anything starting with 'http://' or 'https://' is accepted;
    * anything starting with 'www.' followed by at least one more character
      is accepted;
    * otherwise the string must contain a '.', must not start or end with
      one, and the text after the last '.' (up to any ':') must either end
      in '/' or be an alphabetic entry of `TLDs`.

    The empty string is never URL-like.
    """
    cdef int i
    # Guard first: the string[0] / string[-1] lookups below would raise
    # IndexError on empty input.
    if len(string) == 0:
        return False
    # We're looking for things that function in text like URLs. So, valid URL
    # or not, anything that starts with http:// or https:// is good enough.
    if string.startswith('http://') or string.startswith('https://'):
        return True
    elif string.startswith('www.') and len(string) >= 5:
        return True
    # A leading or trailing dot is sentence punctuation, not a domain.
    if string[0] == '.' or string[-1] == '.':
        return False
    # No dots? Not URLish enough.
    # Explicit scan rather than `'.' in string`: the original author noted
    # that the `in` form did not work under PyPy.
    for i in range(len(string)):
        if string[i] == '.':
            break
    else:
        return False
    # Text after the last dot, with any ':port' style suffix removed.
    tld = string.rsplit('.', 1)[1].split(':', 1)[0]
    if tld.endswith('/'):
        return True

    if tld.isalpha() and tld in TLDs:
        return True
    return False


# TODO: This should live in the language.orth
# English number words recognised by like_number(). Each line of the literal
# ends with a space: adjacent string literals are concatenated verbatim, so
# without it the last word of one line fuses with the first word of the next
# (e.g. "teneleven") and neither word ends up in the set.
NUM_WORDS = set('zero one two three four five six seven eight nine ten '
                'eleven twelve thirteen fourteen fifteen sixteen seventeen '
                'eighteen nineteen twenty thirty forty fifty sixty seventy '
                'eighty ninety hundred thousand million billion trillion '
                'quadrillion gajillion bazillion'.split())
cpdef bint like_number(unicode string):
    """Return True if `string` reads like a number.

    Commas and periods are stripped first. What remains is number-like if
    it is all digits, if it is a fraction whose two sides around a single
    '/' are each number-like, or if it is one of the words in `NUM_WORDS`.
    """
    stripped = string.replace(',', '').replace('.', '')
    if stripped.isdigit():
        return True
    if stripped.count('/') == 1:
        # Exactly one slash, so partition yields numerator and denominator.
        numerator, _, denominator = stripped.partition('/')
        if like_number(numerator) and like_number(denominator):
            return True
    return stripped in NUM_WORDS


# Bound `match` of the e-mail pattern: anchored at the start of the string
# only, so trailing text after a matching prefix is not rejected.
_like_email = re.compile(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)").match


cpdef bint like_email(unicode string):
    """Return True if `string` starts with something shaped like an e-mail address."""
    return _like_email(string) is not None

cpdef unicode word_shape(unicode string):
    """Return the orthographic shape of `string`.

    Upper-case letters map to 'X', other letters to 'x', digits to 'd', and
    every other character to itself. A run of identical shape characters is
    capped at four. Strings of 100 or more characters collapse to 'LONG'.
    """
    if len(string) >= 100:
        return 'LONG'
    pieces = []
    previous = ""
    run = 0
    for ch in string:
        if ch.isalpha():
            symbol = "X" if ch.isupper() else "x"
        elif ch.isdigit():
            symbol = "d"
        else:
            symbol = ch
        if symbol != previous:
            # A new run starts here; `run` counts repeats after the first.
            run = 0
            previous = symbol
        else:
            run += 1
        if run < 4:
            pieces.append(symbol)
    return ''.join(pieces)


cpdef unicode norm1(unicode string, lower_pc=0.0, upper_pc=0.0, title_pc=0.0):
    """Placeholder for level 1 normalization (not implemented yet).

    The intended behaviour is:

    * Case is canonicalized, using frequency statistics
    * Unicode mapped to ascii, via unidecode
    * Regional spelling variations are normalized

    For now every argument is ignored and None is returned.
    """
    return None


cpdef bytes asciied(unicode string):
    """Return an ASCII byte-string transliteration of `string`.

    The text is transliterated with `unidecode` and encoded as ASCII. If
    the transliteration comes back empty, the placeholder b'???' is
    returned instead.
    """
    transliterated = unidecode(string)
    if transliterated:
        return transliterated.encode('ascii')
    return b'???'


# Exceptions --- do not convert these
_uk_us_except = set([
    'our',
    'ours',
    'four',
    'fours',
    'your',
    'yours',
    'hour',
    'hours',
    'course',
    'rise',
])
# Word-final "our" with an optional plural "s". The "s" is captured so the
# substitution can keep it. Compiled once at import time, not on every call.
_uk_us_our = re.compile(r'our(s?)$')


def uk_to_usa(unicode string):
    """Rewrite a British '-our' ending to the American '-or' spelling.

    Only all-lower-case strings that are not listed in `_uk_us_except` are
    touched; everything else is returned unchanged. A plural 's' after
    'our' is preserved, so 'colours' becomes 'colors'.
    """
    if not string.islower():
        return string
    if string in _uk_us_except:
        return string
    return _uk_us_our.sub(r'or\1', string)
* Add asciify string transform, and other bits. 2014-09-02 01:25:28 +04:00			`# -- coding: utf8 --`
* Pass tests. Need to implement more feature functions. 2014-08-30 22:36:06 +04:00			`from __future__ import unicode_literals`
* Add asciify string transform, and other bits. 2014-09-02 01:25:28 +04:00			`import unicodedata`
* Fix issue #112: Replace unidecode with text-unidecode, to avoid license problems. 2015-09-28 16:39:37 +03:00
			`# If your license is not GPL compatible, use text_unidecode. But if your code`
			`# is, you should use the unidecode library, because its performance is better.`
			`# spaCy does not list unidecode as a dependency, in case your license is not`
			`# GPL compatible.`
			`try:`
			`from unidecode import unidecode`
			`except ImportError:`
			`from text_unidecode import unidecode`

* Fix type declaration in asciied function 2015-10-09 05:46:57 +03:00
* Improve efficiency of tagger, and improve morphological processing 2014-12-09 17:02:04 +03:00			`import re`
* Add asciify string transform, and other bits. 2014-09-02 01:25:28 +04:00
			`import math`
* Pass tests. Need to implement more feature functions. 2014-08-30 22:36:06 +04:00
* Refactoring to use Tokens object 2014-09-10 20:11:13 +04:00
			`TAGS = 'adj adp adv conj det noun num pdt pos pron prt punct verb'.upper().split()`


* Add orth features 2014-08-30 21:01:00 +04:00			`# Binary string features`
* Make PyPy work 2015-01-05 09:54:13 +03:00			`cpdef bint is_alpha(unicode string):`
* Add asciify string transform, and other bits. 2014-09-02 01:25:28 +04:00			`return string.isalpha()`

* Add orth features 2014-08-30 21:01:00 +04:00
* Make PyPy work 2015-01-05 09:54:13 +03:00			`cpdef bint is_digit(unicode string):`
* Add asciify string transform, and other bits. 2014-09-02 01:25:28 +04:00			`return string.isdigit()`

* Add orth features 2014-08-30 21:01:00 +04:00
* Make PyPy work 2015-01-05 09:54:13 +03:00			`cpdef bint is_punct(unicode string):`
* Add asciify string transform, and other bits. 2014-09-02 01:25:28 +04:00			`for c in string:`
* Bug fixes to flag features 2014-09-02 01:41:31 +04:00			`if not unicodedata.category(c).startswith('P'):`
			`return False`
* Add asciify string transform, and other bits. 2014-09-02 01:25:28 +04:00			`else:`
* Bug fixes to flag features 2014-09-02 01:41:31 +04:00			`return True`
* Add asciify string transform, and other bits. 2014-09-02 01:25:28 +04:00
* Add orth features 2014-08-30 21:01:00 +04:00
* Make PyPy work 2015-01-05 09:54:13 +03:00			`cpdef bint is_space(unicode string):`
* Add asciify string transform, and other bits. 2014-09-02 01:25:28 +04:00			`return string.isspace()`

* Add orth features 2014-08-30 21:01:00 +04:00
* Make PyPy work 2015-01-05 09:54:13 +03:00			`cpdef bint is_ascii(unicode string):`
* Add asciify string transform, and other bits. 2014-09-02 01:25:28 +04:00			`for c in string:`
* Bug fixes to flag features 2014-09-02 01:41:31 +04:00			`if ord(c) >= 128:`
* Add asciify string transform, and other bits. 2014-09-02 01:25:28 +04:00			`return False`
			`else:`
			`return True`

* Add orth features 2014-08-30 21:01:00 +04:00
* Make PyPy work 2015-01-05 09:54:13 +03:00			`cpdef bint is_title(unicode string):`
* Add asciify string transform, and other bits. 2014-09-02 01:25:28 +04:00			`return string.istitle()`

* Add orth features 2014-08-30 21:01:00 +04:00
* Make PyPy work 2015-01-05 09:54:13 +03:00			`cpdef bint is_lower(unicode string):`
* Add asciify string transform, and other bits. 2014-09-02 01:25:28 +04:00			`return string.islower()`

* Add orth features 2014-08-30 21:01:00 +04:00
* Make PyPy work 2015-01-05 09:54:13 +03:00			`cpdef bint is_upper(unicode string):`
* Add asciify string transform, and other bits. 2014-09-02 01:25:28 +04:00			`return string.isupper()`
* Add orth features 2014-08-30 21:01:00 +04:00
* Implement is_number 2014-11-01 11:13:24 +03:00
* Add is_urlish function 2014-11-01 09:39:34 +03:00			`TLDs = set("com\|org\|edu\|gov\|net\|mil\|aero\|asia\|biz\|cat\|coop\|info\|int\|jobs\|mobi\|museum\|"`
			`"name\|pro\|tel\|travel\|xxx\|"`
			`"ac\|ad\|ae\|af\|ag\|ai\|al\|am\|an\|ao\|aq\|ar\|as\|at\|au\|aw\|ax\|az\|ba\|bb\|bd\|be\|bf\|bg\|"`
			`"bh\|bi\|bj\|bm\|bn\|bo\|br\|bs\|bt\|bv\|bw\|by\|bz\|ca\|cc\|cd\|cf\|cg\|ch\|ci\|ck\|cl\|cm\|cn\|"`
			`"co\|cr\|cs\|cu\|cv\|cx\|cy\|cz\|dd\|de\|dj\|dk\|dm\|do\|dz\|ec\|ee\|eg\|eh\|er\|es\|et\|eu\|fi\|"`
			`"fj\|fk\|fm\|fo\|fr\|ga\|gb\|gd\|ge\|gf\|gg\|gh\|gi\|gl\|gm\|gn\|gp\|gq\|gr\|gs\|gt\|gu\|gw\|gy\|"`
			`"hk\|hm\|hn\|hr\|ht\|hu\|id\|ie\|il\|im\|in\|io\|iq\|ir\|is\|it\|je\|jm\|jo\|jp\|ke\|kg\|kh\|ki\|"`
			`"km\|kn\|kp\|kr\|kw\|ky\|kz\|la\|lb\|lc\|li\|lk\|lr\|ls\|lt\|lu\|lv\|ly\|ma\|mc\|md\|me\|mg\|mh\|"`
			`"mk\|ml\|mm\|mn\|mo\|mp\|mq\|mr\|ms\|mt\|mu\|mv\|mw\|mx\|my\|mz\|na\|nc\|ne\|nf\|ng\|ni\|nl\|no\|np\|"`
			`"nr\|nu\|nz\|om\|pa\|pe\|pf\|pg\|ph\|pk\|pl\|pm\|pn\|pr\|ps\|pt\|pw\|py\|qa\|re\|ro\|rs\|ru\|rw\|sa\|"`
			`"sb\|sc\|sd\|se\|sg\|sh\|si\|sj\|sk\|sl\|sm\|sn\|so\|sr\|ss\|st\|su\|sv\|sy\|sz\|tc\|td\|tf\|tg\|th\|"`
			`"tj\|tk\|tl\|tm\|tn\|to\|tp\|tr\|tt\|tv\|tw\|tz\|ua\|ug\|uk\|us\|uy\|uz\|va\|vc\|ve\|vg\|vi\|vn\|vu\|"`
			`"wf\|ws\|ye\|yt\|za\|zm\|zw".split('\|'))`

* Implement is_number 2014-11-01 11:13:24 +03:00
* Make PyPy work 2015-01-05 09:54:13 +03:00			`cpdef bint like_url(unicode string):`
* Add is_urlish function 2014-11-01 09:39:34 +03:00			`# We're looking for things that function in text like URLs. So, valid URL`
			`# or not, anything they say http:// is going to be good.`
* Fix https for url detection 2015-08-23 03:40:35 +03:00			`if string.startswith('http://') or string.startswith('https://'):`
* Add is_urlish function 2014-11-01 09:39:34 +03:00			`return True`
			`elif string.startswith('www.') and len(string) >= 5:`
			`return True`
			`# No dots? Not URLish enough`
* Make PyPy work 2015-01-05 09:54:13 +03:00			`if string[0] == '.' or string[-1] == '.':`
			`return False`
			`# This should be a call to "in", but PyPy lacks this function?`
			`cdef int i`
			`for i in range(len(string)):`
			`if string[i] == '.':`
			`break`
			`else:`
* Add is_urlish function 2014-11-01 09:39:34 +03:00			`return False`
			`tld = string.rsplit('.', 1)[1].split(':', 1)[0]`
			`if tld.endswith('/'):`
			`return True`

			`if tld.isalpha() and tld in TLDs:`
			`return True`
			`return False`


* Work on language-independent refactoring 2015-08-23 21:49:18 +03:00			`# TODO: This should live in the language.orth`
* Implement is_number 2014-11-01 11:13:24 +03:00			`NUM_WORDS = set('zero one two three four five six seven eight nine ten'`
			`'eleven twelve thirteen fourteen fifteen sixteen seventeen'`
			`'eighteen nineteen twenty thirty forty fifty sixty seventy'`
			`'eighty ninety hundred thousand million billion trillion'`
			`'quadrillion gajillion bazillion'.split())`
* Make PyPy work 2015-01-05 09:54:13 +03:00			`cpdef bint like_number(unicode string):`
* Implement is_number 2014-11-01 11:13:24 +03:00			`string = string.replace(',', '')`
			`string = string.replace('.', '')`
			`if string.isdigit():`
			`return True`
			`if string.count('/') == 1:`
			`num, denom = string.split('/')`
* Add LIKE_URL and LIKE_NUMBER flag features 2014-11-02 05:19:05 +03:00			`if like_number(num) and like_number(denom):`
* Implement is_number 2014-11-01 11:13:24 +03:00			`return True`
			`if string in NUM_WORDS:`
			`return True`
			`return False`
* Add orth features 2014-08-30 21:01:00 +04:00
* Pass tests. Need to implement more feature functions. 2014-08-30 22:36:06 +04:00
* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects 2015-07-26 17:37:16 +03:00			`_like_email = re.compile(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)").match`
			`cpdef bint like_email(unicode string):`
			`return _like_email(string)`


* Make PyPy work 2015-01-05 09:54:13 +03:00			`cpdef unicode word_shape(unicode string):`
* Add length cap to word shape feature 2015-07-20 13:06:59 +03:00			`if len(string) >= 100:`
			`return 'LONG'`
* Add orth features 2014-08-30 21:01:00 +04:00			`length = len(string)`
* Add offsets to Tokens class. Some changes to interfaces, and reorganization of spacy.Lang 2014-10-14 08:47:06 +04:00			`shape = []`
* Add orth features 2014-08-30 21:01:00 +04:00			`last = ""`
			`shape_char = ""`
			`seq = 0`
			`for c in string:`
			`if c.isalpha():`
			`if c.isupper():`
			`shape_char = "X"`
			`else:`
			`shape_char = "x"`
			`elif c.isdigit():`
			`shape_char = "d"`
			`else:`
			`shape_char = c`
			`if shape_char == last:`
			`seq += 1`
			`else:`
			`seq = 0`
			`last = shape_char`
* Refine word_shape feature, by trimming the max sequence length 2014-11-06 20:41:29 +03:00			`if seq < 4:`
* Add offsets to Tokens class. Some changes to interfaces, and reorganization of spacy.Lang 2014-10-14 08:47:06 +04:00			`shape.append(shape_char)`
			`return ''.join(shape)`
* Add orth features 2014-08-30 21:01:00 +04:00

* Work on word vectors, and other stuff 2015-01-17 08:21:17 +03:00			`cpdef unicode norm1(unicode string, lower_pc=0.0, upper_pc=0.0, title_pc=0.0):`
			`"""Apply level 1 normalization:`

			`* Case is canonicalized, using frequency statistics`
			`* Unicode mapped to ascii, via unidecode`
			`* Regional spelling variations are normalized`
			`"""`
			`pass`


* Make PyPy work 2015-01-05 09:54:13 +03:00			`cpdef bytes asciied(unicode string):`
* Fix type declaration in asciied function 2015-10-09 05:46:57 +03:00			`stripped = unidecode(string)`
* Finally get string types right for orth function 2015-01-05 19:17:39 +03:00			`if not stripped:`
* Fix unicode error in orth 2015-01-04 21:53:08 +03:00			`return b'???'`
* Finally get string types right for orth function 2015-01-05 19:17:39 +03:00			`return stripped.encode('ascii')`
* Work on word vectors, and other stuff 2015-01-17 08:21:17 +03:00

			`# Exceptions --- do not convert these`
			`_uk_us_except = set([`
			`'our',`
			`'ours',`
			`'four',`
			`'fours',`
			`'your',`
			`'yours',`
			`'hour',`
			`'hours',`
			`'course',`
			`'rise',`
			`])`
			`def uk_to_usa(unicode string):`
			`if not string.islower():`
			`return string`
			`if string in _uk_us_except:`
			`return string`
			`our = re.compile(r'ours?$')`
			`string = our.sub('or', string)`

			`return string`