mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 18:06:29 +03:00
* Merge master into rethinc2
This commit is contained in:
commit
1ef84a0557
|
@ -85,3 +85,11 @@ cpdef enum attr_id_t:
|
||||||
HEAD
|
HEAD
|
||||||
SPACY
|
SPACY
|
||||||
PROB
|
PROB
|
||||||
|
|
||||||
|
# Move these up to FLAG14--FLAG18 once we finish the functionality and
|
||||||
|
# are ready to regenerate the model
|
||||||
|
#IS_BRACKET
|
||||||
|
#IS_QUOTE
|
||||||
|
#IS_LEFT_PUNCT
|
||||||
|
#IS_RIGHT_PUNCT
|
||||||
|
|
||||||
|
|
|
@ -13,7 +13,6 @@ IDS = {
|
||||||
"LIKE_EMAIL": LIKE_EMAIL,
|
"LIKE_EMAIL": LIKE_EMAIL,
|
||||||
"IS_STOP": IS_STOP,
|
"IS_STOP": IS_STOP,
|
||||||
"IS_OOV": IS_OOV,
|
"IS_OOV": IS_OOV,
|
||||||
|
|
||||||
"FLAG14": FLAG14,
|
"FLAG14": FLAG14,
|
||||||
"FLAG15": FLAG15,
|
"FLAG15": FLAG15,
|
||||||
"FLAG16": FLAG16,
|
"FLAG16": FLAG16,
|
||||||
|
|
|
@ -82,6 +82,22 @@ class Language(object):
|
||||||
def is_title(string):
|
def is_title(string):
|
||||||
return orth.is_title(string)
|
return orth.is_title(string)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def is_bracket(string):
|
||||||
|
return orth.is_bracket(string)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def is_quote(string):
|
||||||
|
return orth.is_quote(string)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def is_left_punct(string):
|
||||||
|
return orth.is_left_punct(string)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def is_right_punct(string):
|
||||||
|
return orth.is_right_punct(string)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def is_upper(string):
|
def is_upper(string):
|
||||||
return orth.is_upper(string)
|
return orth.is_upper(string)
|
||||||
|
@ -121,6 +137,10 @@ class Language(object):
|
||||||
attrs.IS_SPACE: cls.is_space,
|
attrs.IS_SPACE: cls.is_space,
|
||||||
attrs.IS_TITLE: cls.is_title,
|
attrs.IS_TITLE: cls.is_title,
|
||||||
attrs.IS_UPPER: cls.is_upper,
|
attrs.IS_UPPER: cls.is_upper,
|
||||||
|
attrs.FLAG14: cls.is_bracket,
|
||||||
|
attrs.FLAG15: cls.is_quote,
|
||||||
|
attrs.FLAG16: cls.is_left_punct,
|
||||||
|
attrs.FLAG17: cls.is_right_punct,
|
||||||
attrs.LIKE_URL: cls.like_url,
|
attrs.LIKE_URL: cls.like_url,
|
||||||
attrs.LIKE_NUM: cls.like_num,
|
attrs.LIKE_NUM: cls.like_num,
|
||||||
attrs.LIKE_EMAIL: cls.like_email,
|
attrs.LIKE_EMAIL: cls.like_email,
|
||||||
|
|
|
@ -18,6 +18,10 @@ import numpy
|
||||||
|
|
||||||
from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
|
from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
|
||||||
from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
|
from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
|
||||||
|
from .attrs cimport FLAG14 as IS_BRACKET
|
||||||
|
from .attrs cimport FLAG15 as IS_QUOTE
|
||||||
|
from .attrs cimport FLAG16 as IS_LEFT_PUNCT
|
||||||
|
from .attrs cimport FLAG17 as IS_RIGHT_PUNCT
|
||||||
from .attrs cimport IS_OOV
|
from .attrs cimport IS_OOV
|
||||||
|
|
||||||
|
|
||||||
|
@ -183,6 +187,23 @@ cdef class Lexeme:
|
||||||
def __get__(self): return Lexeme.c_check_flag(self.c, IS_SPACE)
|
def __get__(self): return Lexeme.c_check_flag(self.c, IS_SPACE)
|
||||||
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_SPACE, x)
|
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_SPACE, x)
|
||||||
|
|
||||||
|
property is_bracket:
|
||||||
|
def __get__(self): return Lexeme.c_check_flag(self.c, IS_BRACKET)
|
||||||
|
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_BRACKET, x)
|
||||||
|
|
||||||
|
property is_quote:
|
||||||
|
def __get__(self): return Lexeme.c_check_flag(self.c, IS_QUOTE)
|
||||||
|
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_QUOTE, x)
|
||||||
|
|
||||||
|
property is_left_punct:
|
||||||
|
def __get__(self): return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT)
|
||||||
|
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, x)
|
||||||
|
|
||||||
|
property is_right_punct:
|
||||||
|
def __get__(self): return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT)
|
||||||
|
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x)
|
||||||
|
|
||||||
|
|
||||||
property like_url:
|
property like_url:
|
||||||
def __get__(self): return Lexeme.c_check_flag(self.c, LIKE_URL)
|
def __get__(self): return Lexeme.c_check_flag(self.c, LIKE_URL)
|
||||||
def __set__(self, bint x): Lexeme.c_set_flag(self.c, LIKE_URL, x)
|
def __set__(self, bint x): Lexeme.c_set_flag(self.c, LIKE_URL, x)
|
||||||
|
|
|
@ -15,7 +15,6 @@ from libcpp.vector cimport vector
|
||||||
from murmurhash.mrmr cimport hash64
|
from murmurhash.mrmr cimport hash64
|
||||||
|
|
||||||
from .attrs cimport LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE
|
from .attrs cimport LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE
|
||||||
from .attrs cimport FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25
|
|
||||||
from .tokens.doc cimport get_token_attr
|
from .tokens.doc cimport get_token_attr
|
||||||
from .tokens.doc cimport Doc
|
from .tokens.doc cimport Doc
|
||||||
from .vocab cimport Vocab
|
from .vocab cimport Vocab
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
# -*- coding: utf8 -*-
|
# -*- coding: utf8 -*-
|
||||||
|
# cython: infer_types=True
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
import unicodedata
|
import unicodedata
|
||||||
|
|
||||||
|
@ -48,6 +49,21 @@ cpdef bint is_ascii(unicode string):
|
||||||
else:
|
else:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
cpdef bint is_bracket(unicode string):
|
||||||
|
return False
|
||||||
|
|
||||||
|
cpdef bint is_quote(unicode string):
|
||||||
|
if string in ('"', "'"):
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
return False
|
||||||
|
|
||||||
|
cpdef bint is_left_punct(unicode string):
|
||||||
|
return False
|
||||||
|
|
||||||
|
cpdef bint is_right_punct(unicode string):
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
cpdef bint is_title(unicode string):
|
cpdef bint is_title(unicode string):
|
||||||
return string.istitle()
|
return string.istitle()
|
||||||
|
|
|
@ -14,7 +14,7 @@ cpdef enum symbol_t:
|
||||||
IS_STOP
|
IS_STOP
|
||||||
IS_OOV
|
IS_OOV
|
||||||
|
|
||||||
FLAG14
|
FLAG14 = 14
|
||||||
FLAG15
|
FLAG15
|
||||||
FLAG16
|
FLAG16
|
||||||
FLAG17
|
FLAG17
|
||||||
|
@ -419,3 +419,10 @@ cpdef enum symbol_t:
|
||||||
rcmod
|
rcmod
|
||||||
root
|
root
|
||||||
xcomp
|
xcomp
|
||||||
|
|
||||||
|
# Move these up to FLAG14--FLAG18 once we finish the functionality
|
||||||
|
# and are ready to regenerate the model.
|
||||||
|
#IS_BRACKET
|
||||||
|
#IS_QUOTE
|
||||||
|
#IS_LEFT_PUNCT
|
||||||
|
#IS_RIGHT_PUNCT
|
||||||
|
|
|
@ -13,7 +13,6 @@ IDS = {
|
||||||
"LIKE_EMAIL": LIKE_EMAIL,
|
"LIKE_EMAIL": LIKE_EMAIL,
|
||||||
"IS_STOP": IS_STOP,
|
"IS_STOP": IS_STOP,
|
||||||
"IS_OOV": IS_OOV,
|
"IS_OOV": IS_OOV,
|
||||||
|
|
||||||
"FLAG14": FLAG14,
|
"FLAG14": FLAG14,
|
||||||
"FLAG15": FLAG15,
|
"FLAG15": FLAG15,
|
||||||
"FLAG16": FLAG16,
|
"FLAG16": FLAG16,
|
||||||
|
|
|
@ -41,3 +41,18 @@ def test_is_digit(words):
|
||||||
assert not is_digit(words[7])
|
assert not is_digit(words[7])
|
||||||
assert not is_digit(words[8])
|
assert not is_digit(words[8])
|
||||||
assert not is_digit(words[9])
|
assert not is_digit(words[9])
|
||||||
|
|
||||||
|
|
||||||
|
def test_is_quote(words):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def test_is_bracket(words):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def test_is_left_bracket(words):
|
||||||
|
pass
|
||||||
|
|
||||||
|
def test_is_right_bracket(words):
|
||||||
|
pass
|
||||||
|
|
|
@ -18,6 +18,10 @@ from ..attrs cimport POS, LEMMA, TAG, DEP
|
||||||
from ..parts_of_speech cimport CONJ, PUNCT
|
from ..parts_of_speech cimport CONJ, PUNCT
|
||||||
|
|
||||||
from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
|
from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
|
||||||
|
from ..attrs cimport FLAG14 as IS_BRACKET
|
||||||
|
from ..attrs cimport FLAG15 as IS_QUOTE
|
||||||
|
from ..attrs cimport FLAG16 as IS_LEFT_PUNCT
|
||||||
|
from ..attrs cimport FLAG17 as IS_RIGHT_PUNCT
|
||||||
from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
|
from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
|
||||||
from ..attrs cimport IS_OOV
|
from ..attrs cimport IS_OOV
|
||||||
|
|
||||||
|
@ -363,6 +367,18 @@ cdef class Token:
|
||||||
property is_space:
|
property is_space:
|
||||||
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_SPACE)
|
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_SPACE)
|
||||||
|
|
||||||
|
property is_bracket:
|
||||||
|
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_BRACKET)
|
||||||
|
|
||||||
|
property is_quote:
|
||||||
|
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_QUOTE)
|
||||||
|
|
||||||
|
property is_left_punct:
|
||||||
|
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_LEFT_PUNCT)
|
||||||
|
|
||||||
|
property is_right_punct:
|
||||||
|
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_RIGHT_PUNCT)
|
||||||
|
|
||||||
property like_url:
|
property like_url:
|
||||||
def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_URL)
|
def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_URL)
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user