* Merge master into rethinc2

2025-12-18 15:44:34 +03:00 · 2016-02-05 12:55:59 +01:00 · 2016-02-05 12:55:59 +01:00 · 1ef84a0557
commit 1ef84a0557
parent 4cf34fc170 a66e2f2f53
10 changed files with 105 additions and 5 deletions
--- a/spacy/attrs.pxd
+++ b/spacy/attrs.pxd
@ -85,3 +85,11 @@ cpdef enum attr_id_t:
    HEAD
    SPACY
    PROB
 # Move these up to FLAG14--FLAG17 once we finish the functionality and
 # are ready to regenerate the model
 #IS_BRACKET
 #IS_QUOTE
 #IS_LEFT_PUNCT
 #IS_RIGHT_PUNCT
--- a/spacy/attrs.pyx
+++ b/spacy/attrs.pyx
@ -13,7 +13,6 @@ IDS = {
    "LIKE_EMAIL": LIKE_EMAIL,
    "IS_STOP": IS_STOP,
    "IS_OOV": IS_OOV,
    "FLAG14": FLAG14,
    "FLAG15": FLAG15,
    "FLAG16": FLAG16,
--- a/spacy/language.py
+++ b/spacy/language.py
@ -82,6 +82,22 @@ class Language(object):
    def is_title(string):
        return orth.is_title(string)
    @staticmethod
    def is_bracket(string):
        """Return whatever ``orth.is_bracket`` reports for ``string``."""
        return orth.is_bracket(string)
    @staticmethod
    def is_quote(string):
        """Return whatever ``orth.is_quote`` reports for ``string``."""
        return orth.is_quote(string)
    @staticmethod
    def is_left_punct(string):
        """Return whatever ``orth.is_left_punct`` reports for ``string``."""
        return orth.is_left_punct(string)
    @staticmethod
    def is_right_punct(string):
        """Return whatever ``orth.is_right_punct`` reports for ``string``."""
        return orth.is_right_punct(string)
    @staticmethod
    def is_upper(string):
        """Return whatever ``orth.is_upper`` reports for ``string``."""
        return orth.is_upper(string)
@ -121,6 +137,10 @@ class Language(object):
            attrs.IS_SPACE: cls.is_space,
            attrs.IS_TITLE: cls.is_title,
            attrs.IS_UPPER: cls.is_upper,
            attrs.FLAG14: cls.is_bracket,
            attrs.FLAG15: cls.is_quote,
            attrs.FLAG16: cls.is_left_punct,
            attrs.FLAG17: cls.is_right_punct,
            attrs.LIKE_URL: cls.like_url,
            attrs.LIKE_NUM: cls.like_num,
            attrs.LIKE_EMAIL: cls.like_email,
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@ -18,6 +18,10 @@ import numpy
 from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
 from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
 from .attrs cimport FLAG14 as IS_BRACKET
 from .attrs cimport FLAG15 as IS_QUOTE
 from .attrs cimport FLAG16 as IS_LEFT_PUNCT
 from .attrs cimport FLAG17 as IS_RIGHT_PUNCT
 from .attrs cimport IS_OOV
@ -183,6 +187,23 @@ cdef class Lexeme:
        def __get__(self): return Lexeme.c_check_flag(self.c, IS_SPACE)
        def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_SPACE, x)
    property is_bracket:
        """The IS_BRACKET flag of the underlying lexeme struct; readable and writable."""
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_BRACKET)

        def __set__(self, bint value):
            Lexeme.c_set_flag(self.c, IS_BRACKET, value)
    property is_quote:
        """The IS_QUOTE flag of the underlying lexeme struct; readable and writable."""
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_QUOTE)

        def __set__(self, bint value):
            Lexeme.c_set_flag(self.c, IS_QUOTE, value)
    property is_left_punct:
        """The IS_LEFT_PUNCT flag of the underlying lexeme struct; readable and writable."""
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT)

        def __set__(self, bint value):
            Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, value)
    property is_right_punct:
        """The IS_RIGHT_PUNCT flag of the underlying lexeme struct; readable and writable."""
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT)

        def __set__(self, bint value):
            Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, value)
    property like_url:
        """The LIKE_URL flag of the underlying lexeme struct; readable and writable."""
        def __get__(self):
            return Lexeme.c_check_flag(self.c, LIKE_URL)

        def __set__(self, bint value):
            Lexeme.c_set_flag(self.c, LIKE_URL, value)
--- a/spacy/matcher.pyx
+++ b/spacy/matcher.pyx
@ -15,7 +15,6 @@ from libcpp.vector cimport vector
 from murmurhash.mrmr cimport hash64
 from .attrs cimport LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE
 from .attrs cimport FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25
 from .tokens.doc cimport get_token_attr
 from .tokens.doc cimport Doc
 from .vocab cimport Vocab
--- a/spacy/orth.pyx
+++ b/spacy/orth.pyx
@ -1,4 +1,5 @@
 # -*- coding: utf8 -*-
 # cython: infer_types=True
 from __future__ import unicode_literals
 import unicodedata
@ -48,6 +49,21 @@ cpdef bint is_ascii(unicode string):
    else:
        return True
 cpdef bint is_bracket(unicode string):
    """Check whether `string` is exactly one ASCII bracket character.

    Round, square and curly brackets are recognised, opening or closing.
    Everything else -- including the empty string, None and strings of
    more than one character -- yields False.

    string (unicode): The text to test.
    RETURNS (bint): True if `string` is one of ( ) [ ] { }.
    """
    return string in ('(', ')', '[', ']', '{', '}')
 cpdef bint is_quote(unicode string):
    """Return True when `string` is exactly an ASCII double or single quote."""
    return string in ('"', "'")
 cpdef bint is_left_punct(unicode string):
    """Check whether `string` is a single opening punctuation character.

    A character counts as "left" punctuation when its Unicode general
    category is Ps (open punctuation, e.g. an opening bracket) or
    Pi (initial quote punctuation, e.g. a left curly quote). The ASCII
    quotes " and ' are direction-neutral (category Po) and are not matched.

    string (unicode): The text to test.
    RETURNS (bint): True for exactly one Ps/Pi character, otherwise False.
    """
    # unicodedata.category() accepts exactly one character and raises on
    # anything else, so reject None, '' and multi-character strings first.
    if string is None or len(string) != 1:
        return False
    return unicodedata.category(string) in ('Ps', 'Pi')
 cpdef bint is_right_punct(unicode string):
    """Check whether `string` is a single closing punctuation character.

    A character counts as "right" punctuation when its Unicode general
    category is Pe (close punctuation, e.g. a closing bracket) or
    Pf (final quote punctuation, e.g. a right curly quote). The ASCII
    quotes " and ' are direction-neutral (category Po) and are not matched.

    string (unicode): The text to test.
    RETURNS (bint): True for exactly one Pe/Pf character, otherwise False.
    """
    # unicodedata.category() accepts exactly one character and raises on
    # anything else, so reject None, '' and multi-character strings first.
    if string is None or len(string) != 1:
        return False
    return unicodedata.category(string) in ('Pe', 'Pf')
 # Title-case check: the result is exactly what the string's own
 # ``istitle()`` method returns.
 cpdef bint is_title(unicode string):
    return string.istitle()
--- a/spacy/symbols.pxd
+++ b/spacy/symbols.pxd
@ -14,7 +14,7 @@ cpdef enum symbol_t:
    IS_STOP
    IS_OOV
-    FLAG14
+    FLAG14 = 14
    FLAG15
    FLAG16
    FLAG17
@ -419,3 +419,10 @@ cpdef enum symbol_t:
    rcmod
    root
    xcomp
 # Move these up to FLAG14--FLAG17 once we finish the functionality
 # and are ready to regenerate the model.
 #IS_BRACKET
 #IS_QUOTE
 #IS_LEFT_PUNCT
 #IS_RIGHT_PUNCT
--- a/spacy/symbols.pyx
+++ b/spacy/symbols.pyx
@ -13,7 +13,6 @@ IDS = {
    "LIKE_EMAIL": LIKE_EMAIL,
    "IS_STOP": IS_STOP,
    "IS_OOV": IS_OOV,
    "FLAG14": FLAG14,
    "FLAG15": FLAG15,
    "FLAG16": FLAG16,
--- a/spacy/tests/vocab/test_flag_features.py
+++ b/spacy/tests/vocab/test_flag_features.py
@ -41,3 +41,18 @@ def test_is_digit(words):
    assert not is_digit(words[7])
    assert not is_digit(words[8])
    assert not is_digit(words[9])
 def test_is_quote(words):
    # Placeholder: contains no assertions yet, so it passes unconditionally.
    # TODO: add real checks for the quote flag.
    pass
 def test_is_bracket(words):
    # Placeholder: contains no assertions yet, so it passes unconditionally.
    # TODO: add real checks for the bracket flag.
    pass
 def test_is_left_bracket(words):
    # Placeholder: contains no assertions yet, so it passes unconditionally.
    # NOTE(review): the name says "left_bracket", but the attribute exposed
    # elsewhere in the package is ``is_left_punct`` -- confirm the intended name.
    pass
 def test_is_right_bracket(words):
    # Placeholder: contains no assertions yet, so it passes unconditionally.
    # NOTE(review): the name says "right_bracket", but the attribute exposed
    # elsewhere in the package is ``is_right_punct`` -- confirm the intended name.
    pass
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@ -18,6 +18,10 @@ from ..attrs cimport POS, LEMMA, TAG, DEP
 from ..parts_of_speech cimport CONJ, PUNCT
 from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
 from ..attrs cimport FLAG14 as IS_BRACKET
 from ..attrs cimport FLAG15 as IS_QUOTE
 from ..attrs cimport FLAG16 as IS_LEFT_PUNCT
 from ..attrs cimport FLAG17 as IS_RIGHT_PUNCT
 from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
 from ..attrs cimport IS_OOV
@ -363,6 +367,18 @@ cdef class Token:
    property is_space:
        """The IS_SPACE flag, looked up on this token's lexeme (``self.c.lex``)."""
        def __get__(self):
            return Lexeme.c_check_flag(self.c.lex, IS_SPACE)
    property is_bracket:
        """The IS_BRACKET flag, looked up on this token's lexeme (``self.c.lex``)."""
        def __get__(self):
            return Lexeme.c_check_flag(self.c.lex, IS_BRACKET)
    property is_quote:
        """The IS_QUOTE flag, looked up on this token's lexeme (``self.c.lex``)."""
        def __get__(self):
            return Lexeme.c_check_flag(self.c.lex, IS_QUOTE)
    property is_left_punct:
        """The IS_LEFT_PUNCT flag, looked up on this token's lexeme (``self.c.lex``)."""
        def __get__(self):
            return Lexeme.c_check_flag(self.c.lex, IS_LEFT_PUNCT)
    property is_right_punct:
        """The IS_RIGHT_PUNCT flag, looked up on this token's lexeme (``self.c.lex``)."""
        def __get__(self):
            return Lexeme.c_check_flag(self.c.lex, IS_RIGHT_PUNCT)
    property like_url:
        def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_URL)