spaCy/spacy/orth.py

168 lines
4.6 KiB
Python
Raw Normal View History

# -*- coding: utf8 -*-
from __future__ import unicode_literals
import unicodedata
from unidecode import unidecode
import math
2014-09-10 20:11:13 +04:00
# Coarse part-of-speech tag names, stored upper-cased.
TAGS = list(
    tag_name.upper()
    for tag_name in 'adj adp adv conj det noun num pdt pos pron prt punct verb'.split()
)
2014-08-30 21:01:00 +04:00
# Binary string features
def is_alpha(string):
    """Return True if `string` is non-empty and consists only of alphabetic characters."""
    return string.isalpha()
2014-08-30 21:01:00 +04:00
def is_digit(string):
    """Return True if `string` is non-empty and consists only of digit characters."""
    return string.isdigit()
2014-08-30 21:01:00 +04:00
def is_punct(string):
    """Return True if every character of `string` is Unicode punctuation.

    A character counts as punctuation when its Unicode general category
    starts with 'P'. The empty string yields True, since no character
    fails the check.
    """
    return all(unicodedata.category(ch).startswith('P') for ch in string)
2014-08-30 21:01:00 +04:00
def is_space(string):
    """Return True if `string` is non-empty and consists only of whitespace characters."""
    return string.isspace()
2014-08-30 21:01:00 +04:00
def is_ascii(string):
    """Return True if every character of `string` has a code point below 128.

    The empty string yields True, since no character fails the check.
    """
    return all(ord(ch) < 128 for ch in string)
2014-08-30 21:01:00 +04:00
def is_title(string):
    """Return True if `string` is title-cased according to `str.istitle`."""
    return string.istitle()
2014-08-30 21:01:00 +04:00
def is_lower(string):
    """Return True if `string` has at least one cased character and all of them are lowercase."""
    return string.islower()
2014-08-30 21:01:00 +04:00
def is_upper(string):
    """Return True if `string` has at least one cased character and all of them are uppercase."""
    return string.isupper()
2014-08-30 21:01:00 +04:00
2014-11-01 11:13:24 +03:00
2014-11-01 09:39:34 +03:00
# Top-level domain suffixes that like_url accepts as the final dot-separated
# component of a token. The first two literal lines hold multi-letter generic
# suffixes; the remaining lines hold two-letter codes. The adjacent literals
# are concatenated into one '|'-delimited string before being split.
TLDs = set("com|org|edu|gov|net|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|"
"name|pro|tel|travel|xxx|"
"ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|"
"bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|"
"co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|"
"fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|"
"hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|"
"km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|"
"mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|"
"nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|"
"sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|ss|st|su|sv|sy|sz|tc|td|tf|tg|th|"
"tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|"
"wf|ws|ye|yt|za|zm|zw".split('|'))
2014-11-01 11:13:24 +03:00
def like_url(string):
    """Return True if `string` looks like a URL as it would appear in text.

    The check is deliberately loose: it does not validate URLs, it only
    recognises tokens that function like one. A token is accepted when it

    * starts with 'http://' or 'https://',
    * starts with 'www.' and has at least one more character, or
    * contains a dot (not as first or last character) and the part after
      the last dot, with any ':...' suffix dropped, either ends with '/'
      or is an alphabetic entry of TLDs.

    The empty string is never URL-like.
    """
    # We're looking for things that function in text like URLs. So, valid URL
    # or not, anything that starts with an http(s) scheme is good enough.
    if string.startswith(('http://', 'https://')):
        return True
    if string.startswith('www.') and len(string) >= 5:
        return True
    # Empty, no dots, or a leading/trailing dot? Not URLish enough. The
    # emptiness test comes first so the index lookups below cannot fail.
    if not string or '.' not in string or string[0] == '.' or string[-1] == '.':
        return False
    # Everything after the last dot, minus a ':port'-style suffix.
    tld = string.rsplit('.', 1)[1].split(':', 1)[0]
    if tld.endswith('/'):
        return True
    return tld.isalpha() and tld in TLDs
2014-11-01 11:13:24 +03:00
# Spelled-out number words recognised by like_number. Every literal except
# the last ends with a space: adjacent string literals are concatenated
# before .split() runs, so without the separator the neighbouring words
# would fuse (e.g. 'ten' + 'eleven' -> 'teneleven') and drop out of the set.
NUM_WORDS = set(
    'zero one two three four five six seven eight nine ten '
    'eleven twelve thirteen fourteen fifteen sixteen seventeen '
    'eighteen nineteen twenty thirty forty fifty sixty seventy '
    'eighty ninety hundred thousand million billion trillion '
    'quadrillion gajillion bazillion'.split()
)
def like_number(string):
    """Return True if `string` resembles a number.

    Commas and periods are removed first. The remainder is number-like if
    it is all digits, if it is a single-slash fraction whose two sides are
    themselves number-like, or if it is an entry of NUM_WORDS.
    """
    stripped = string.replace(',', '').replace('.', '')
    if stripped.isdigit():
        return True
    if stripped.count('/') == 1:
        numerator, denominator = stripped.split('/')
        if like_number(numerator) and like_number(denominator):
            return True
    return stripped in NUM_WORDS
2014-08-30 21:01:00 +04:00
# Statistics features
def oft_case(name, thresh):
    """Build a placeholder case-statistics feature function.

    `name` and `thresh` are accepted but not used. The returned callable
    takes (string, prob, case_stats, tag_stats) and gives back `string`
    unchanged.
    """
    def wrapped(string, prob, case_stats, tag_stats):
        # Stub: none of the statistics arguments are consulted.
        return string

    return wrapped
2014-09-10 20:27:44 +04:00
def can_tag(name, thresh=0.5):
    """Build a placeholder tag-statistics feature function.

    `name` and `thresh` are accepted but not used. The returned callable
    takes (string, prob, case_stats, tag_stats) and gives back `string`
    unchanged.
    """
    def wrapped(string, prob, case_stats, tag_stats):
        # Stub: none of the statistics arguments are consulted.
        return string

    return wrapped
# String features
2014-10-29 15:19:38 +03:00
def canon_case(string, upper_pc=0.0, title_pc=0.0, lower_pc=0.0):
    """Return `string` re-cased to whichever casing has the largest share.

    `upper_pc`, `title_pc` and `lower_pc` are compared directly. Ties go
    to upper case first, then title case, and lower case is the fallback.
    With the all-zero defaults the result is therefore upper-cased.
    """
    upper_wins = upper_pc >= lower_pc and upper_pc >= title_pc
    if upper_wins:
        return string.upper()
    if title_pc >= lower_pc:
        return string.title()
    return string.lower()
2014-08-30 21:01:00 +04:00
2014-10-29 15:19:38 +03:00
def word_shape(string):
    """Return a coarse orthographic shape for `string`.

    Each character is mapped to a shape symbol: 'X' for an uppercase
    letter, 'x' for any other alphabetic character, 'd' for a digit, and
    the character itself otherwise. Runs of the same symbol are truncated
    to at most five, so 'abcdefgh' becomes 'xxxxx'.
    """
    shape = []
    last = ""
    seq = 0  # How many times `last` has repeated since the run began.
    for c in string:
        if c.isalpha():
            shape_char = "X" if c.isupper() else "x"
        elif c.isdigit():
            shape_char = "d"
        else:
            shape_char = c
        if shape_char == last:
            seq += 1
        else:
            seq = 0
            last = shape_char
        # Keep the first symbol of a run plus up to four repeats.
        if seq < 5:
            shape.append(shape_char)
    return ''.join(shape)
2014-08-30 21:01:00 +04:00
2014-10-29 15:19:38 +03:00
def non_sparse(string, prob, cluster, upper_pc, title_pc, lower_pc):
    """Return a less sparse representation of `string`.

    Alphabetic strings are re-cased with canon_case using the three case
    shares. Other strings are returned unchanged when `prob` is at least
    log(0.0001), and otherwise replaced by their word_shape. `cluster` is
    accepted but not used.
    """
    min_log_prob = math.log(0.0001)
    if is_alpha(string):
        return canon_case(string, upper_pc, title_pc, lower_pc)
    if prob >= min_log_prob:
        return string
    return word_shape(string)
2014-10-29 15:19:38 +03:00
def asciied(string):
    """Return an ASCII transliteration of `string` as text.

    The transliteration comes from `unidecode`. If it produces nothing,
    the placeholder '???' is returned instead.
    """
    ascii_string = unidecode(string)
    if not ascii_string:
        return '???'
    # Only byte strings need decoding. A text result has no usable
    # .decode() on Python 3, so it is returned as-is.
    if isinstance(ascii_string, bytes):
        return ascii_string.decode('ascii')
    return ascii_string