spaCy/spacy/orth.py

# -*- coding: utf8 -*-
from __future__ import unicode_literals
import unicodedata
from unidecode import unidecode

import math


# Upper-cased tag names (ADJ, ADP, ..., VERB), in the order written below.
# NOTE(review): presumably the coarse part-of-speech tag set -- confirm
# against the code that consumes TAGS.
TAGS = (
    'adj adp adv conj det noun num pdt pos pron prt punct verb'
    .upper()
    .split()
)


# Binary string features
def is_alpha(string):
    """Return True if ``string`` is non-empty and every character is alphabetic.

    Thin wrapper around ``str.isalpha`` so the check can be passed around as
    a plain feature function.
    """
    return string.isalpha()


def is_digit(string):
    """Return True if ``string`` is non-empty and every character is a digit.

    Thin wrapper around ``str.isdigit``.
    """
    return string.isdigit()


def is_punct(string):
    """Return True if every character of ``string`` is punctuation.

    A character counts as punctuation when its Unicode general category
    starts with ``'P'``. An empty string yields True, because there is no
    character that fails the check.
    """
    return all(unicodedata.category(char).startswith('P') for char in string)


def is_space(string):
    """Return True if ``string`` is non-empty and consists only of whitespace.

    Thin wrapper around ``str.isspace``.
    """
    return string.isspace()


def is_ascii(string):
    """Return True if every character of ``string`` has a code point below 128.

    An empty string yields True, because it contains no non-ASCII character.
    """
    return all(ord(char) < 128 for char in string)


def is_title(string):
    """Return True if ``string`` is title-cased, as judged by ``str.istitle``."""
    return string.istitle()


def is_lower(string):
    """Return True if ``string`` is lower-cased, as judged by ``str.islower``.

    Strings without any cased character (e.g. digits only) yield False.
    """
    return string.islower()


def is_upper(string):
    """Return True if ``string`` is upper-cased, as judged by ``str.isupper``.

    Strings without any cased character (e.g. digits only) yield False.
    """
    return string.isupper()


# Lower-case top-level-domain suffixes, built by splitting one long
# '|'-separated string. like_url() treats a token as URL-like when the text
# after its last '.' is a member of this set. The first two lines are generic
# TLDs; the remaining lines are two-letter codes in alphabetical order.
TLDs = set("com|org|edu|gov|net|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|"
        "name|pro|tel|travel|xxx|"
        "ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|"
        "bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|"
        "co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|"
        "fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|"
        "hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|"
        "km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|"
        "mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|"
        "nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|"
        "sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|ss|st|su|sv|sy|sz|tc|td|tf|tg|th|"
        "tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|"
        "wf|ws|ye|yt|za|zm|zw".split('|'))


def like_url(string):
    """Return True if ``string`` looks like a URL in running text.

    This is a heuristic, not a validator: the goal is to catch tokens that
    *function* as URLs, whether or not they are well-formed.

    Rules, applied in order:

    * an empty string is never URL-like;
    * anything starting with ``http://`` or ``https://`` is URL-like;
    * anything starting with ``www.`` followed by at least one more
      character is URL-like;
    * a token with no dot, or with a leading or trailing dot, is not;
    * otherwise take the text after the last dot (cut at the first ``:``,
      which drops a port suffix): the token is URL-like if that segment ends
      with ``/`` or is an alphabetic member of ``TLDs``.
    """
    # Guard first: the positional checks below index string[0] / string[-1],
    # which would raise IndexError on an empty token.
    if not string:
        return False
    # Valid URL or not, anything that carries an explicit scheme is good enough.
    if string.startswith(('http://', 'https://')):
        return True
    if string.startswith('www.') and len(string) >= 5:
        return True
    # No dots, or a dot only at an edge? Not URLish enough.
    if string[0] == '.' or string[-1] == '.' or '.' not in string:
        return False
    tld = string.rsplit('.', 1)[1].split(':', 1)[0]
    # A trailing slash after the last dotted segment is accepted without
    # consulting the TLD list.
    if tld.endswith('/'):
        return True
    return tld.isalpha() and tld in TLDs


# Spelled-out number words recognised by like_number().
# Every fragment except the last ends with a space: adjacent string literals
# are concatenated before .split() runs, so without the separator the last
# word of one line fuses with the first word of the next (e.g. 'teneleven').
NUM_WORDS = set('zero one two three four five six seven eight nine ten '
                'eleven twelve thirteen fourteen fifteen sixteen seventeen '
                'eighteen nineteen twenty thirty forty fifty sixty seventy '
                'eighty ninety hundred thousand million billion trillion '
                'quadrillion gajillion bazillion'.split())
def like_number(string):
    """Return True if ``string`` looks like a number.

    Commas and periods are stripped first, then the remainder is accepted
    when it is:

    * made up of digits only, or
    * a fraction -- exactly one ``/`` whose two sides are themselves
      number-like, or
    * a member of ``NUM_WORDS``.
    """
    stripped = string.replace(',', '').replace('.', '')
    if stripped.isdigit():
        return True
    numerator, slash, denominator = stripped.partition('/')
    # Exactly one slash: one was found, and none remains after it.
    is_fraction = bool(slash) and '/' not in denominator
    if is_fraction and like_number(numerator) and like_number(denominator):
        return True
    return stripped in NUM_WORDS

# Statistics features
def oft_case(name, thresh):
    """Build a statistics feature function (currently a placeholder).

    ``name`` and ``thresh`` are accepted but not used yet: the returned
    callable takes ``(string, prob, case_stats, tag_stats)`` and hands back
    ``string`` unchanged.
    """
    def wrapped(string, prob, case_stats, tag_stats):
        # Placeholder: none of the statistics are consulted.
        return string

    return wrapped


def can_tag(name, thresh=0.5):
    """Build a tag-statistics feature function (currently a placeholder).

    ``name`` and ``thresh`` are accepted but not used yet: the returned
    callable takes ``(string, prob, case_stats, tag_stats)`` and hands back
    ``string`` unchanged.
    """
    def wrapped(string, prob, case_stats, tag_stats):
        # Placeholder: none of the statistics are consulted.
        return string

    return wrapped


# String features
def canon_case(string, upper_pc=0.0, title_pc=0.0, lower_pc=0.0):
    """Return ``string`` re-cased according to whichever score is largest.

    The three ``*_pc`` arguments are compared against each other; ties are
    broken in the order upper, then title, then lower. With the default
    all-zero scores the result is therefore upper-cased.
    """
    upper_wins = upper_pc >= lower_pc and upper_pc >= title_pc
    if upper_wins:
        return string.upper()
    return string.title() if title_pc >= lower_pc else string.lower()


def word_shape(string):
    """Return the orthographic shape of ``string``.

    Each character is mapped to a shape character: ``X`` for upper-case
    letters, ``x`` for any other alphabetic character, ``d`` for digits, and
    the character itself otherwise. Runs of the same shape character are
    truncated to at most five, so long words collapse to a bounded pattern
    (``'aaaaaaaa'`` -> ``'xxxxx'``).
    """
    shape = []
    last = ""
    # Number of consecutive repeats of ``last`` seen so far (0 for the first
    # character of a run).
    seq = 0
    for c in string:
        if c.isalpha():
            shape_char = "X" if c.isupper() else "x"
        elif c.isdigit():
            shape_char = "d"
        else:
            shape_char = c
        if shape_char == last:
            seq += 1
        else:
            seq = 0
            last = shape_char
        # seq runs 0..4 for the first five characters of a run; drop the rest.
        if seq < 5:
            shape.append(shape_char)
    return ''.join(shape)


def non_sparse(string, prob, cluster, upper_pc, title_pc, lower_pc):
    """Return a less sparse stand-in for ``string``.

    * Purely alphabetic strings are re-cased with ``canon_case`` using the
      three case scores.
    * Other strings are kept verbatim when ``prob`` is at least
      ``log(0.0001)``.
    * Anything else is replaced by its ``word_shape``.

    ``cluster`` is accepted but not used here.
    NOTE(review): the comparison against ``math.log(0.0001)`` suggests
    ``prob`` is a natural-log probability -- confirm with the caller.
    """
    if is_alpha(string):
        return canon_case(string, upper_pc, title_pc, lower_pc)
    frequent_enough = prob >= math.log(0.0001)
    return string if frequent_enough else word_shape(string)


def asciied(string):
    """Return an ASCII transliteration of ``string`` as text.

    The string is passed through ``unidecode``; if nothing survives the
    transliteration, the placeholder ``'???'`` is returned instead.
    """
    ascii_string = unidecode(string)
    if not ascii_string:
        return '???'
    # Only a byte string needs (and has) .decode(); when unidecode already
    # returned text, as it does on Python 3, calling .decode() would raise
    # AttributeError, so hand the text back as is.
    if isinstance(ascii_string, bytes):
        return ascii_string.decode('ascii')
    return ascii_string
* Add asciify string transform, and other bits. 2014-09-02 01:25:28 +04:00			`# -- coding: utf8 --`
* Pass tests. Need to implement more feature functions. 2014-08-30 22:36:06 +04:00			`from __future__ import unicode_literals`
* Add asciify string transform, and other bits. 2014-09-02 01:25:28 +04:00			`import unicodedata`
* Switch to new data model, tests passing 2014-10-10 01:11:31 +04:00			`from unidecode import unidecode`
* Add asciify string transform, and other bits. 2014-09-02 01:25:28 +04:00
			`import math`
* Pass tests. Need to implement more feature functions. 2014-08-30 22:36:06 +04:00
* Refactoring to use Tokens object 2014-09-10 20:11:13 +04:00
			`TAGS = 'adj adp adv conj det noun num pdt pos pron prt punct verb'.upper().split()`


* Add orth features 2014-08-30 21:01:00 +04:00			`# Binary string features`
* Switch to new data model, tests passing 2014-10-10 01:11:31 +04:00			`def is_alpha(string):`
* Add asciify string transform, and other bits. 2014-09-02 01:25:28 +04:00			`return string.isalpha()`

* Add orth features 2014-08-30 21:01:00 +04:00
* Switch to new data model, tests passing 2014-10-10 01:11:31 +04:00			`def is_digit(string):`
* Add asciify string transform, and other bits. 2014-09-02 01:25:28 +04:00			`return string.isdigit()`

* Add orth features 2014-08-30 21:01:00 +04:00
* Switch to new data model, tests passing 2014-10-10 01:11:31 +04:00			`def is_punct(string):`
* Add asciify string transform, and other bits. 2014-09-02 01:25:28 +04:00			`for c in string:`
* Bug fixes to flag features 2014-09-02 01:41:31 +04:00			`if not unicodedata.category(c).startswith('P'):`
			`return False`
* Add asciify string transform, and other bits. 2014-09-02 01:25:28 +04:00			`else:`
* Bug fixes to flag features 2014-09-02 01:41:31 +04:00			`return True`
* Add asciify string transform, and other bits. 2014-09-02 01:25:28 +04:00
* Add orth features 2014-08-30 21:01:00 +04:00
* Switch to new data model, tests passing 2014-10-10 01:11:31 +04:00			`def is_space(string):`
* Add asciify string transform, and other bits. 2014-09-02 01:25:28 +04:00			`return string.isspace()`

* Add orth features 2014-08-30 21:01:00 +04:00
* Switch to new data model, tests passing 2014-10-10 01:11:31 +04:00			`def is_ascii(string):`
* Add asciify string transform, and other bits. 2014-09-02 01:25:28 +04:00			`for c in string:`
* Bug fixes to flag features 2014-09-02 01:41:31 +04:00			`if ord(c) >= 128:`
* Add asciify string transform, and other bits. 2014-09-02 01:25:28 +04:00			`return False`
			`else:`
			`return True`

* Add orth features 2014-08-30 21:01:00 +04:00
* Switch to new data model, tests passing 2014-10-10 01:11:31 +04:00			`def is_title(string):`
* Add asciify string transform, and other bits. 2014-09-02 01:25:28 +04:00			`return string.istitle()`

* Add orth features 2014-08-30 21:01:00 +04:00
* Switch to new data model, tests passing 2014-10-10 01:11:31 +04:00			`def is_lower(string):`
* Add asciify string transform, and other bits. 2014-09-02 01:25:28 +04:00			`return string.islower()`

* Add orth features 2014-08-30 21:01:00 +04:00
* Switch to new data model, tests passing 2014-10-10 01:11:31 +04:00			`def is_upper(string):`
* Add asciify string transform, and other bits. 2014-09-02 01:25:28 +04:00			`return string.isupper()`
* Add orth features 2014-08-30 21:01:00 +04:00
* Implement is_number 2014-11-01 11:13:24 +03:00
* Add is_urlish function 2014-11-01 09:39:34 +03:00			`TLDs = set("com\|org\|edu\|gov\|net\|mil\|aero\|asia\|biz\|cat\|coop\|info\|int\|jobs\|mobi\|museum\|"`
			`"name\|pro\|tel\|travel\|xxx\|"`
			`"ac\|ad\|ae\|af\|ag\|ai\|al\|am\|an\|ao\|aq\|ar\|as\|at\|au\|aw\|ax\|az\|ba\|bb\|bd\|be\|bf\|bg\|"`
			`"bh\|bi\|bj\|bm\|bn\|bo\|br\|bs\|bt\|bv\|bw\|by\|bz\|ca\|cc\|cd\|cf\|cg\|ch\|ci\|ck\|cl\|cm\|cn\|"`
			`"co\|cr\|cs\|cu\|cv\|cx\|cy\|cz\|dd\|de\|dj\|dk\|dm\|do\|dz\|ec\|ee\|eg\|eh\|er\|es\|et\|eu\|fi\|"`
			`"fj\|fk\|fm\|fo\|fr\|ga\|gb\|gd\|ge\|gf\|gg\|gh\|gi\|gl\|gm\|gn\|gp\|gq\|gr\|gs\|gt\|gu\|gw\|gy\|"`
			`"hk\|hm\|hn\|hr\|ht\|hu\|id\|ie\|il\|im\|in\|io\|iq\|ir\|is\|it\|je\|jm\|jo\|jp\|ke\|kg\|kh\|ki\|"`
			`"km\|kn\|kp\|kr\|kw\|ky\|kz\|la\|lb\|lc\|li\|lk\|lr\|ls\|lt\|lu\|lv\|ly\|ma\|mc\|md\|me\|mg\|mh\|"`
			`"mk\|ml\|mm\|mn\|mo\|mp\|mq\|mr\|ms\|mt\|mu\|mv\|mw\|mx\|my\|mz\|na\|nc\|ne\|nf\|ng\|ni\|nl\|no\|np\|"`
			`"nr\|nu\|nz\|om\|pa\|pe\|pf\|pg\|ph\|pk\|pl\|pm\|pn\|pr\|ps\|pt\|pw\|py\|qa\|re\|ro\|rs\|ru\|rw\|sa\|"`
			`"sb\|sc\|sd\|se\|sg\|sh\|si\|sj\|sk\|sl\|sm\|sn\|so\|sr\|ss\|st\|su\|sv\|sy\|sz\|tc\|td\|tf\|tg\|th\|"`
			`"tj\|tk\|tl\|tm\|tn\|to\|tp\|tr\|tt\|tv\|tw\|tz\|ua\|ug\|uk\|us\|uy\|uz\|va\|vc\|ve\|vg\|vi\|vn\|vu\|"`
			`"wf\|ws\|ye\|yt\|za\|zm\|zw".split('\|'))`

* Implement is_number 2014-11-01 11:13:24 +03:00
* Add LIKE_URL and LIKE_NUMBER flag features 2014-11-02 05:19:05 +03:00			`def like_url(string):`
* Add is_urlish function 2014-11-01 09:39:34 +03:00			`# We're looking for things that function in text like URLs. So, valid URL`
			`# or not, anything they say http:// is going to be good.`
			`if string.startswith('http://'):`
			`return True`
			`elif string.startswith('www.') and len(string) >= 5:`
			`return True`
			`# No dots? Not URLish enough`
			`if string[0] == '.' or string[-1] == '.' or '.' not in string:`
			`return False`
			`tld = string.rsplit('.', 1)[1].split(':', 1)[0]`
			`if tld.endswith('/'):`
			`return True`

			`if tld.isalpha() and tld in TLDs:`
			`return True`
			`return False`


* Implement is_number 2014-11-01 11:13:24 +03:00			`NUM_WORDS = set('zero one two three four five six seven eight nine ten'`
			`'eleven twelve thirteen fourteen fifteen sixteen seventeen'`
			`'eighteen nineteen twenty thirty forty fifty sixty seventy'`
			`'eighty ninety hundred thousand million billion trillion'`
			`'quadrillion gajillion bazillion'.split())`
* Add LIKE_URL and LIKE_NUMBER flag features 2014-11-02 05:19:05 +03:00			`def like_number(string):`
* Implement is_number 2014-11-01 11:13:24 +03:00			`string = string.replace(',', '')`
			`string = string.replace('.', '')`
			`if string.isdigit():`
			`return True`
			`if string.count('/') == 1:`
			`num, denom = string.split('/')`
* Add LIKE_URL and LIKE_NUMBER flag features 2014-11-02 05:19:05 +03:00			`if like_number(num) and like_number(denom):`
* Implement is_number 2014-11-01 11:13:24 +03:00			`return True`
			`if string in NUM_WORDS:`
			`return True`
			`return False`
* Add orth features 2014-08-30 21:01:00 +04:00
			`# Statistics features`
			`def oft_case(name, thresh):`
			`def wrapped(string, prob, case_stats, tag_stats):`
			`return string`
			`return wrapped`


* Refactor to use tokens class. 2014-09-10 20:27:44 +04:00			`def can_tag(name, thresh=0.5):`
* Add orth features 2014-08-30 21:01:00 +04:00			`def wrapped(string, prob, case_stats, tag_stats):`
			`return string`
			`return wrapped`


			`# String features`
* Rewriting Lexeme serialization. 2014-10-29 15:19:38 +03:00			`def canon_case(string, upper_pc=0.0, title_pc=0.0, lower_pc=0.0):`
* Add canon_case function 2014-08-30 22:57:43 +04:00			`if upper_pc >= lower_pc and upper_pc >= title_pc:`
			`return string.upper()`
			`elif title_pc >= lower_pc:`
			`return string.title()`
			`else:`
			`return string.lower()`
* Add orth features 2014-08-30 21:01:00 +04:00
* Pass tests. Need to implement more feature functions. 2014-08-30 22:36:06 +04:00
* Rewriting Lexeme serialization. 2014-10-29 15:19:38 +03:00			`def word_shape(string):`
* Add orth features 2014-08-30 21:01:00 +04:00			`length = len(string)`
* Add offsets to Tokens class. Some changes to interfaces, and reorganization of spacy.Lang 2014-10-14 08:47:06 +04:00			`shape = []`
* Add orth features 2014-08-30 21:01:00 +04:00			`last = ""`
			`shape_char = ""`
			`seq = 0`
			`for c in string:`
			`if c.isalpha():`
			`if c.isupper():`
			`shape_char = "X"`
			`else:`
			`shape_char = "x"`
			`elif c.isdigit():`
			`shape_char = "d"`
			`else:`
			`shape_char = c`
			`if shape_char == last:`
			`seq += 1`
			`else:`
			`seq = 0`
			`last = shape_char`
* Add asciify string transform, and other bits. 2014-09-02 01:25:28 +04:00			`if seq < 5:`
* Add offsets to Tokens class. Some changes to interfaces, and reorganization of spacy.Lang 2014-10-14 08:47:06 +04:00			`shape.append(shape_char)`
			`return ''.join(shape)`
* Add orth features 2014-08-30 21:01:00 +04:00

* Rewriting Lexeme serialization. 2014-10-29 15:19:38 +03:00			`def non_sparse(string, prob, cluster, upper_pc, title_pc, lower_pc):`
* Switch to new data model, tests passing 2014-10-10 01:11:31 +04:00			`if is_alpha(string):`
* Rewriting Lexeme serialization. 2014-10-29 15:19:38 +03:00			`return canon_case(string, upper_pc, title_pc, lower_pc)`
* Add asciify string transform, and other bits. 2014-09-02 01:25:28 +04:00			`elif prob >= math.log(0.0001):`
			`return string`
			`else:`
* Rewriting Lexeme serialization. 2014-10-29 15:19:38 +03:00			`return word_shape(string)`
* Add asciify string transform, and other bits. 2014-09-02 01:25:28 +04:00

* Rewriting Lexeme serialization. 2014-10-29 15:19:38 +03:00			`def asciied(string):`
* Switch to new data model, tests passing 2014-10-10 01:11:31 +04:00			`ascii_string = unidecode(string)`
* Add is_urlish function 2014-11-01 09:39:34 +03:00			`if not ascii_string:`
			`return '???'`
* Switch to new data model, tests passing 2014-10-10 01:11:31 +04:00			`return ascii_string.decode('ascii')`