mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 02:06:31 +03:00
replace Language functions with corresponding orth functions
implement punctuation functions in orth
This commit is contained in:
parent
d9312bc9ea
commit
bc9c62e279
|
@ -34,10 +34,6 @@ class Language(object):
|
|||
def norm(string):
|
||||
return string
|
||||
|
||||
@staticmethod
|
||||
def shape(string):
|
||||
return orth.word_shape(string)
|
||||
|
||||
@staticmethod
|
||||
def prefix(string):
|
||||
return string[0]
|
||||
|
@ -50,66 +46,14 @@ class Language(object):
|
|||
def cluster(string):
|
||||
return 0
|
||||
|
||||
@staticmethod
|
||||
def is_alpha(string):
|
||||
return orth.is_alpha(string)
|
||||
|
||||
@staticmethod
|
||||
def is_ascii(string):
|
||||
return orth.is_ascii(string)
|
||||
|
||||
@staticmethod
|
||||
def is_digit(string):
|
||||
return string.isdigit()
|
||||
|
||||
@staticmethod
|
||||
def is_lower(string):
|
||||
return orth.is_lower(string)
|
||||
|
||||
@staticmethod
|
||||
def is_punct(string):
|
||||
return orth.is_punct(string)
|
||||
|
||||
@staticmethod
|
||||
def is_space(string):
|
||||
return string.isspace()
|
||||
|
||||
@staticmethod
|
||||
def is_title(string):
|
||||
return orth.is_title(string)
|
||||
|
||||
@staticmethod
|
||||
def is_bracket(string):
|
||||
return orth.is_bracket(string)
|
||||
|
||||
@staticmethod
|
||||
def is_quote(string):
|
||||
return orth.is_quote(string)
|
||||
|
||||
@staticmethod
|
||||
def is_left_punct(string):
|
||||
return orth.is_left_punct(string)
|
||||
|
||||
@staticmethod
|
||||
def is_right_punct(string):
|
||||
return orth.is_right_punct(string)
|
||||
|
||||
@staticmethod
|
||||
def is_upper(string):
|
||||
return orth.is_upper(string)
|
||||
|
||||
@staticmethod
|
||||
def like_url(string):
|
||||
return orth.like_url(string)
|
||||
|
||||
@staticmethod
|
||||
def like_num(string):
|
||||
return orth.like_number(string)
|
||||
|
||||
@staticmethod
|
||||
def like_email(string):
|
||||
return orth.like_email(string)
|
||||
|
||||
@staticmethod
|
||||
def is_stop(string):
|
||||
return 0
|
||||
|
@ -120,26 +64,26 @@ class Language(object):
|
|||
return {
|
||||
attrs.LOWER: cls.lower,
|
||||
attrs.NORM: cls.norm,
|
||||
attrs.SHAPE: cls.shape,
|
||||
attrs.SHAPE: orth.word_shape,
|
||||
attrs.PREFIX: cls.prefix,
|
||||
attrs.SUFFIX: cls.suffix,
|
||||
attrs.CLUSTER: cls.cluster,
|
||||
attrs.PROB: lambda string: oov_prob,
|
||||
attrs.IS_ALPHA: cls.is_alpha,
|
||||
attrs.IS_ASCII: cls.is_ascii,
|
||||
attrs.IS_ALPHA: orth.is_alpha,
|
||||
attrs.IS_ASCII: orth.is_ascii,
|
||||
attrs.IS_DIGIT: cls.is_digit,
|
||||
attrs.IS_LOWER: cls.is_lower,
|
||||
attrs.IS_PUNCT: cls.is_punct,
|
||||
attrs.IS_LOWER: orth.is_lower,
|
||||
attrs.IS_PUNCT: orth.is_punct,
|
||||
attrs.IS_SPACE: cls.is_space,
|
||||
attrs.IS_TITLE: cls.is_title,
|
||||
attrs.IS_UPPER: cls.is_upper,
|
||||
attrs.FLAG14: cls.is_bracket,
|
||||
attrs.FLAG15: cls.is_quote,
|
||||
attrs.FLAG16: cls.is_left_punct,
|
||||
attrs.FLAG17: cls.is_right_punct,
|
||||
attrs.LIKE_URL: cls.like_url,
|
||||
attrs.LIKE_NUM: cls.like_num,
|
||||
attrs.LIKE_EMAIL: cls.like_email,
|
||||
attrs.IS_TITLE: orth.is_title,
|
||||
attrs.IS_UPPER: orth.is_upper,
|
||||
attrs.FLAG14: orth.is_bracket,
|
||||
attrs.FLAG15: orth.is_quote,
|
||||
attrs.FLAG16: orth.is_left_punct,
|
||||
attrs.FLAG17: orth.is_right_punct,
|
||||
attrs.LIKE_URL: orth.like_url,
|
||||
attrs.LIKE_NUM: orth.like_number,
|
||||
attrs.LIKE_EMAIL: orth.like_email,
|
||||
attrs.IS_STOP: cls.is_stop,
|
||||
attrs.IS_OOV: lambda string: True
|
||||
}
|
||||
|
|
|
@ -5,9 +5,6 @@ import unicodedata
|
|||
import re
|
||||
|
||||
|
||||
TAGS = 'adj adp adv conj det noun num pdt pos pron prt punct verb'.upper().split()
|
||||
|
||||
|
||||
# Binary string features
|
||||
cpdef bint is_alpha(unicode string):
|
||||
return string.isalpha()
|
||||
|
@ -36,20 +33,25 @@ cpdef bint is_ascii(unicode string):
|
|||
else:
|
||||
return True
|
||||
|
||||
|
||||
cpdef bint is_bracket(unicode string):
|
||||
return False
|
||||
brackets = ('(',')','[',']','{','}','<','>')
|
||||
return string in brackets
|
||||
|
||||
|
||||
cpdef bint is_quote(unicode string):
|
||||
if string in ('"', "'"):
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
quotes = ('"',"'",'`','«','»','‘','’','‚','‛','“','”','„','‟','‹','›','❮','❯')
|
||||
return string in quotes
|
||||
|
||||
|
||||
cpdef bint is_left_punct(unicode string):
|
||||
return False
|
||||
left_punct = ('(','[','{','<','"',"'",'«','‘','‚','‛','“','„','‟','‹','❮')
|
||||
return string in left_punct
|
||||
|
||||
|
||||
cpdef bint is_right_punct(unicode string):
|
||||
return False
|
||||
right_punct = (')',']','}','>','"',"'",'»','’','”','›','❯')
|
||||
return string in right_punct
|
||||
|
||||
|
||||
cpdef bint is_title(unicode string):
|
||||
|
|
Loading…
Reference in New Issue
Block a user