Mirror of https://github.com/explosion/spaCy.git
Add support for Universal Dependencies v2.0
Commit 66e1109b53 (parent 8dff040032)
@@ -93,7 +93,7 @@ NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]

 def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
     '''Normalize a dictionary of attributes, converting them to ints.

     Arguments:
         stringy_attrs (dict):
             Dictionary keyed by attribute string names. Values can be ints or strings.
@@ -125,7 +125,9 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
             'VerbForm', 'PronType', 'Aspect', 'Tense', 'PartType', 'Poss',
             'Hyph', 'ConjType', 'NumType', 'Foreign', 'VerbType', 'NounType',
             'Number', 'PronType', 'AdjType', 'Person', 'Variant', 'AdpType',
-            'Reflex', 'Negative', 'Mood', 'Aspect', 'Case']
+            'Reflex', 'Negative', 'Mood', 'Aspect', 'Case',
+            'Polarity',  # U20
+        ]
         for key in morph_keys:
             if key in stringy_attrs:
                 stringy_attrs.pop(key)
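For illustration, a minimal sketch of what the deprecated morph-key handling above does: with _do_deprecated enabled, any morphological feature named in morph_keys, which now includes the UD v2.0 "Polarity" key, is dropped from the attribute dict before the remaining names are converted to ints. The dict literal below is an invented input for the example, not spaCy's API.

    # Sketch only: mirrors the pop-loop in the diff above.
    stringy_attrs = {'ORTH': 'nicht', 'Polarity': 'Neg', 'PartType': 'res'}
    morph_keys = ['Polarity', 'PartType']  # abridged from the full list above
    for key in morph_keys:
        if key in stringy_attrs:
            stringy_attrs.pop(key)
    assert stringy_attrs == {'ORTH': 'nicht'}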
@@ -41,7 +41,7 @@ TAG_MAP = {
     "PRF": {POS: PRON, "PronType": "prs", "Reflex": "yes"},
     "PTKA": {POS: PART},
     "PTKANT": {POS: PART, "PartType": "res"},
-    "PTKNEG": {POS: PART, "Negative": "yes"},
+    "PTKNEG": {POS: PART, "Polarity": "Neg"},
     "PTKVZ": {POS: PART, "PartType": "vbp"},
     "PTKZU": {POS: PART, "PartType": "inf"},
     "PWAT": {POS: DET, "PronType": "int"},
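The PTKNEG change reflects the UD v2.0 rename of the negation feature: Negative=yes becomes Polarity=Neg, with capitalized feature values. A minimal sketch of that conversion, assuming the features are a plain {name: value} dict rather than spaCy's internal representation:

    # Sketch, assuming a plain dict of morphological features.
    def negative_to_polarity(features):
        features = dict(features)
        if features.pop("Negative", None) == "yes":
            features["Polarity"] = "Neg"  # UD v2.0 spelling
        return features

    print(negative_to_polarity({"PartType": "res", "Negative": "yes"}))
    # {'PartType': 'res', 'Polarity': 'Neg'}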
@@ -16,7 +16,7 @@ TAG_MAP = {
     "$": {POS: SYM, "Other": {"SymType": "currency"}},
     "#": {POS: SYM, "Other": {"SymType": "numbersign"}},
     "AFX": {POS: ADJ, "Hyph": "yes"},
-    "CC": {POS: CONJ, "ConjType": "coor"},
+    "CC": {POS: CCONJ, "ConjType": "coor"},
     "CD": {POS: NUM, "NumType": "card"},
     "DT": {POS: DET},
     "EX": {POS: ADV, "AdvType": "ex"},
@@ -19,6 +19,7 @@ TAG_MAP = {
     "AUX": {POS: AUX},
     "X": {POS: X},
     "CONJ": {POS: CONJ},
+    "CCONJ": {POS: CCONJ},  # U20
     "ADJ": {POS: ADJ},
     "VERB": {POS: VERB},
     "PART": {POS: PART}
@@ -37,7 +37,7 @@ cdef class Morphology:
     cdef int assign_tag(self, TokenC* token, tag) except -1

     cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1

     cdef int assign_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1
@@ -80,6 +80,7 @@ cpdef enum univ_morph_t:
     Definite_two
     Definite_def
     Definite_red
+    Definite_cons  # U20
     Definite_ind
     Degree_cmp
     Degree_comp
@@ -103,6 +104,8 @@ cpdef enum univ_morph_t:
     Negative_neg
     Negative_pos
     Negative_yes
+    Polarity_neg  # U20
+    Polarity_pos  # U20
     Number_com
     Number_dual
     Number_none
@@ -151,6 +154,7 @@ cpdef enum univ_morph_t:
     VerbForm_partPres
     VerbForm_sup
     VerbForm_trans
+    VerbForm_conv  # U20
     VerbForm_gdv  # la
     Voice_act
     Voice_cau
@@ -192,6 +192,7 @@ IDS = {
     "Definite_two": Definite_two,
     "Definite_def": Definite_def,
     "Definite_red": Definite_red,
+    "Definite_cons": Definite_cons,  # U20
     "Definite_ind": Definite_ind,
     "Degree_cmp": Degree_cmp,
     "Degree_comp": Degree_comp,
@@ -215,6 +216,8 @@ IDS = {
     "Negative_neg": Negative_neg,
     "Negative_pos": Negative_pos,
     "Negative_yes": Negative_yes,
+    "Polarity_neg": Polarity_neg,  # U20
+    "Polarity_pos": Polarity_pos,  # U20
     "Number_com": Number_com,
     "Number_dual": Number_dual,
     "Number_none": Number_none,
@@ -263,6 +266,7 @@ IDS = {
     "VerbForm_partPres": VerbForm_partPres,
     "VerbForm_sup": VerbForm_sup,
     "VerbForm_trans": VerbForm_trans,
+    "VerbForm_conv": VerbForm_conv,  # U20
     "VerbForm_gdv ": VerbForm_gdv,  # la,
     "Voice_act": Voice_act,
     "Voice_cau": Voice_cau,
@@ -7,6 +7,7 @@ cpdef enum univ_pos_t:
     ADV
     AUX
     CONJ
+    CCONJ  # U20
     DET
     INTJ
     NOUN
@@ -7,7 +7,8 @@ IDS = {
     "ADP": ADP,
     "ADV": ADV,
     "AUX": AUX,
-    "CONJ": CONJ,
+    "CONJ": CONJ,  # U20
+    "CCONJ": CCONJ,
     "DET": DET,
     "INTJ": INTJ,
     "NOUN": NOUN,
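Note that the old UD v1 name "CONJ" is kept alongside the new "CCONJ", so both strings still resolve to a part-of-speech ID and existing tag maps keep loading. A rough sketch of the resulting lookup behaviour, with enum values invented purely for illustration:

    # Sketch: both spellings stay valid keys during the transition.
    CONJ, CCONJ = 7, 8          # invented IDs, stand-ins for the real enum
    IDS = {"CONJ": CONJ, "CCONJ": CCONJ}
    assert IDS["CONJ"] != IDS["CCONJ"]   # distinct symbols, both resolvable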
@@ -13,7 +13,7 @@ cpdef enum symbol_t:
     LIKE_EMAIL
     IS_STOP
     IS_OOV

     FLAG14 = 14
     FLAG15
     FLAG16
@@ -90,6 +90,7 @@ cpdef enum symbol_t:
     ADV
     AUX
     CONJ
+    CCONJ  # U20
     DET
     INTJ
     NOUN
@@ -107,11 +108,14 @@ cpdef enum symbol_t:

     Animacy_anim
     Animacy_inam
+    Animacy_hum  # U20
     Aspect_freq
     Aspect_imp
     Aspect_mod
     Aspect_none
     Aspect_perf
+    Aspect_iter  # U20
+    Aspect_hab  # U20
     Case_abe
     Case_abl
     Case_abs
@@ -120,10 +124,12 @@ cpdef enum symbol_t:
     Case_all
     Case_cau
     Case_com
+    Case_cmp  # U20
     Case_dat
     Case_del
     Case_dis
     Case_ela
+    Case_equ  # U20
     Case_ess
     Case_gen
     Case_ill
@@ -142,7 +148,9 @@ cpdef enum symbol_t:
     Definite_two
     Definite_def
     Definite_red
+    Definite_cons  # U20
     Definite_ind
+    Definite_spec  # U20
     Degree_cmp
     Degree_comp
     Degree_none
@@ -151,6 +159,8 @@ cpdef enum symbol_t:
     Degree_abs
     Degree_com
     Degree_dim  # du
+    Degree_equ  # U20
+    Evident_nfh  # U20
     Gender_com
     Gender_fem
     Gender_masc
@@ -162,16 +172,21 @@ cpdef enum symbol_t:
     Mood_pot
     Mood_sub
     Mood_opt
+    Mood_prp  # U20
+    Mood_adm  # U20
     Negative_neg
     Negative_pos
     Negative_yes
+    Polarity_neg  # U20
+    Polarity_pos  # U20
     Number_com
     Number_dual
     Number_none
     Number_plur
     Number_sing
     Number_ptan  # bg
-    Number_count  # bg
+    Number_count  # bg, U20
+    Number_tri  # U20
     NumType_card
     NumType_dist
     NumType_frac
@@ -197,7 +212,8 @@ cpdef enum symbol_t:
     PronType_rel
     PronType_tot
     PronType_clit
-    PronType_exc  # es, ca, it, fa
+    PronType_exc  # es, ca, it, fa, U20
+    PronType_emp  # U20
     Reflex_yes
     Tense_fut
     Tense_imp
@@ -213,12 +229,17 @@ cpdef enum symbol_t:
     VerbForm_partPres
     VerbForm_sup
     VerbForm_trans
+    VerbForm_conv  # U20
     VerbForm_gdv  # la
+    VerbForm_vnoun  # U20
     Voice_act
     Voice_cau
     Voice_pass
-    Voice_mid  # gkc
+    Voice_mid  # gkc, U20
     Voice_int  # hb
+    Voice_antip  # U20
+    Voice_dir  # U20
+    Voice_inv  # U20
     Abbr_yes  # cz, fi, sl, U
     AdpType_prep  # cz, U
     AdpType_post  # U
@@ -284,6 +305,10 @@ cpdef enum symbol_t:
     Number_psee_plur  # U
     Number_psor_sing  # cz, fi, sl, U
     Number_psor_plur  # cz, fi, sl, U
+    Number_pauc  # U20
+    Number_grpa  # U20
+    Number_grpl  # U20
+    Number_inv  # U20
     NumForm_digit  # cz, sl, U
     NumForm_roman  # cz, sl, U
     NumForm_word  # cz, sl, U
@@ -311,6 +336,8 @@ cpdef enum symbol_t:
     Person_psor_one  # fi, U
     Person_psor_two  # fi, U
     Person_psor_three  # fi, U
+    Person_zero  # U20
+    Person_four  # U20
     Polite_inf  # bq, U
     Polite_pol  # bq, U
     Polite_abs_inf  # bq, U
@@ -319,6 +346,10 @@ cpdef enum symbol_t:
     Polite_erg_pol  # bq, U
     Polite_dat_inf  # bq, U
     Polite_dat_pol  # bq, U
+    Polite_infm  # U20
+    Polite_form  # U20
+    Polite_form_elev  # U20
+    Polite_form_humb  # U20
     Prefix_yes  # U
     PrepCase_npr  # cz
     PrepCase_pre  # U
@@ -383,6 +414,7 @@ cpdef enum symbol_t:
     ccomp
     complm
     conj
+    cop  # U20
     csubj
     csubjpass
     dep
@@ -405,6 +437,8 @@ cpdef enum symbol_t:
     num
     number
     oprd
+    obj  # U20
+    obl  # U20
     parataxis
     partmod
     pcomp
@@ -91,6 +91,7 @@ IDS = {
     "ADV": ADV,
     "AUX": AUX,
     "CONJ": CONJ,
+    "CCONJ": CCONJ,  # U20
     "DET": DET,
     "INTJ": INTJ,
     "NOUN": NOUN,
@@ -108,11 +109,14 @@ IDS = {

     "Animacy_anim": Animacy_anim,
     "Animacy_inam": Animacy_inam,
+    "Animacy_hum": Animacy_hum,  # U20
     "Aspect_freq": Aspect_freq,
     "Aspect_imp": Aspect_imp,
     "Aspect_mod": Aspect_mod,
     "Aspect_none": Aspect_none,
     "Aspect_perf": Aspect_perf,
+    "Aspect_iter": Aspect_iter,  # U20
+    "Aspect_hab": Aspect_hab,  # U20
     "Case_abe": Case_abe,
     "Case_abl": Case_abl,
     "Case_abs": Case_abs,
@@ -121,10 +125,12 @@ IDS = {
     "Case_all": Case_all,
     "Case_cau": Case_cau,
     "Case_com": Case_com,
+    "Case_cmp": Case_cmp,  # U20
     "Case_dat": Case_dat,
     "Case_del": Case_del,
     "Case_dis": Case_dis,
     "Case_ela": Case_ela,
+    "Case_equ": Case_equ,  # U20
     "Case_ess": Case_ess,
     "Case_gen": Case_gen,
     "Case_ill": Case_ill,
@@ -143,7 +149,9 @@ IDS = {
     "Definite_two": Definite_two,
     "Definite_def": Definite_def,
     "Definite_red": Definite_red,
+    "Definite_cons": Definite_cons,  # U20
     "Definite_ind": Definite_ind,
+    "Definite_spec": Definite_spec,  # U20
     "Degree_cmp": Degree_cmp,
     "Degree_comp": Degree_comp,
     "Degree_none": Degree_none,
@@ -152,6 +160,8 @@ IDS = {
     "Degree_abs": Degree_abs,
     "Degree_com": Degree_com,
     "Degree_dim ": Degree_dim,  # du
+    "Degree_equ": Degree_equ,  # U20
+    "Evident_nfh": Evident_nfh,  # U20
     "Gender_com": Gender_com,
     "Gender_fem": Gender_fem,
     "Gender_masc": Gender_masc,
@@ -163,16 +173,21 @@ IDS = {
     "Mood_pot": Mood_pot,
     "Mood_sub": Mood_sub,
     "Mood_opt": Mood_opt,
+    "Mood_prp": Mood_prp,  # U20
+    "Mood_adm": Mood_adm,  # U20
     "Negative_neg": Negative_neg,
     "Negative_pos": Negative_pos,
     "Negative_yes": Negative_yes,
+    "Polarity_neg": Polarity_neg,  # U20
+    "Polarity_pos": Polarity_pos,  # U20
     "Number_com": Number_com,
     "Number_dual": Number_dual,
     "Number_none": Number_none,
     "Number_plur": Number_plur,
     "Number_sing": Number_sing,
     "Number_ptan ": Number_ptan,  # bg
-    "Number_count ": Number_count,  # bg
+    "Number_count ": Number_count,  # bg, U20
+    "Number_tri": Number_tri,  # U20
     "NumType_card": NumType_card,
     "NumType_dist": NumType_dist,
     "NumType_frac": NumType_frac,
@@ -198,7 +213,8 @@ IDS = {
     "PronType_rel": PronType_rel,
     "PronType_tot": PronType_tot,
     "PronType_clit": PronType_clit,
-    "PronType_exc ": PronType_exc,  # es, ca, it, fa,
+    "PronType_exc": PronType_exc,  # es, ca, it, fa, U20
+    "PronType_emp": PronType_emp,  # U20
     "Reflex_yes": Reflex_yes,
     "Tense_fut": Tense_fut,
     "Tense_imp": Tense_imp,
@@ -214,12 +230,17 @@ IDS = {
     "VerbForm_partPres": VerbForm_partPres,
     "VerbForm_sup": VerbForm_sup,
     "VerbForm_trans": VerbForm_trans,
+    "VerbForm_conv": VerbForm_conv,  # U20
     "VerbForm_gdv ": VerbForm_gdv,  # la,
+    "VerbForm_vnoun": VerbForm_vnoun,  # U20
     "Voice_act": Voice_act,
     "Voice_cau": Voice_cau,
     "Voice_pass": Voice_pass,
-    "Voice_mid ": Voice_mid,  # gkc,
+    "Voice_mid ": Voice_mid,  # gkc, U20
     "Voice_int ": Voice_int,  # hb,
+    "Voice_antip": Voice_antip,  # U20
+    "Voice_dir": Voice_dir,  # U20
+    "Voice_inv": Voice_inv,  # U20
     "Abbr_yes ": Abbr_yes,  # cz, fi, sl, U,
     "AdpType_prep ": AdpType_prep,  # cz, U,
     "AdpType_post ": AdpType_post,  # U,
@@ -285,6 +306,10 @@ IDS = {
     "Number_psee_plur ": Number_psee_plur,  # U,
     "Number_psor_sing ": Number_psor_sing,  # cz, fi, sl, U,
     "Number_psor_plur ": Number_psor_plur,  # cz, fi, sl, U,
+    "Number_pauc": Number_pauc,  # U20
+    "Number_grpa": Number_grpa,  # U20
+    "Number_grpl": Number_grpl,  # U20
+    "Number_inv": Number_inv,  # U20
     "NumForm_digit ": NumForm_digit,  # cz, sl, U,
     "NumForm_roman ": NumForm_roman,  # cz, sl, U,
     "NumForm_word ": NumForm_word,  # cz, sl, U,
@@ -312,6 +337,8 @@ IDS = {
     "Person_psor_one ": Person_psor_one,  # fi, U,
     "Person_psor_two ": Person_psor_two,  # fi, U,
     "Person_psor_three ": Person_psor_three,  # fi, U,
+    "Person_zero ": Person_zero,  # U20
+    "Person_four ": Person_four,  # U20
     "Polite_inf ": Polite_inf,  # bq, U,
     "Polite_pol ": Polite_pol,  # bq, U,
     "Polite_abs_inf ": Polite_abs_inf,  # bq, U,
@@ -320,6 +347,10 @@ IDS = {
     "Polite_erg_pol ": Polite_erg_pol,  # bq, U,
     "Polite_dat_inf ": Polite_dat_inf,  # bq, U,
     "Polite_dat_pol ": Polite_dat_pol,  # bq, U,
+    "Polite_infm ": Polite_infm,  # U20
+    "Polite_form ": Polite_form,  # U20
+    "Polite_form_elev ": Polite_form_elev,  # U20
+    "Polite_form_humb ": Polite_form_humb,  # U20
     "Prefix_yes ": Prefix_yes,  # U,
     "PrepCase_npr ": PrepCase_npr,  # cz,
     "PrepCase_pre ": PrepCase_pre,  # U,
@@ -384,6 +415,7 @@ IDS = {
     "ccomp": ccomp,
     "complm": complm,
     "conj": conj,
+    "cop": cop,  # U20
     "csubj": csubj,
     "csubjpass": csubjpass,
     "dep": dep,
@@ -406,6 +438,8 @@ IDS = {
     "num": num,
     "number": number,
     "oprd": oprd,
+    "obj": obj,  # U20
+    "obl": obl,  # U20
     "parataxis": parataxis,
     "partmod": partmod,
     "pcomp": pcomp,
@@ -8,7 +8,7 @@ from spacy.attrs import DEP, HEAD
 def ancestors(tokenid, heads):
     # returns all words going from the word up the path to the root
     # the path to root cannot be longer than the number of words in the sentence
     # this function ends after at most len(heads) steps
     # because it would otherwise loop indefinitely on cycles
     head = tokenid
     cnt = 0
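A pure-Python sketch of the walk those comments describe, assuming heads holds one head index per token and a root points at itself; the counter caps the loop at len(heads) steps so a cyclic (malformed) tree cannot hang it:

    # Sketch of ancestors(): yields head indices up to the root.
    def ancestors(tokenid, heads):
        head = tokenid
        cnt = 0
        while heads[head] != head and cnt < len(heads):
            head = heads[head]
            cnt += 1
            yield head

    print(list(ancestors(0, [1, 2, 2])))   # [1, 2]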
@@ -180,7 +180,7 @@ class PseudoProjectivity:
             next_queue = []
             for qtoken in queue:
                 for child in qtoken.children:
                     if child.is_space: continue
                     if child == token: continue
                     if child.dep_ == headlabel:
                         return child
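The loop above is the inner step of a breadth-first search: scan the subtree level by level and return the first child whose dependency label matches the decorated head label, skipping whitespace tokens and the token itself. A hedged standalone sketch; find_new_head and the fallback to token.head are assumptions about the surrounding method, which this hunk does not show:

    # Sketch of the BFS this hunk sits inside.
    def find_new_head(token, headlabel):
        queue = [token]
        while queue:
            next_queue = []
            for qtoken in queue:
                for child in qtoken.children:
                    if child.is_space:
                        continue
                    if child == token:
                        continue
                    if child.dep_ == headlabel:
                        return child
                    next_queue.append(child)
            queue = next_queue
        return token.head   # assumed fallback when no match is found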
@@ -13,13 +13,13 @@ from thinc.linalg cimport VecVec
 from .typedefs cimport attr_t
 from .tokens.doc cimport Doc
 from .attrs cimport TAG
-from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON
+from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CCONJ, DET, NOUN, NUM, PRON
 from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE
 from .gold cimport GoldParse

 from .attrs cimport *


 cpdef enum:
     P2_orth
     P2_cluster
@@ -71,7 +71,7 @@ cpdef enum:

 cdef class TaggerModel(AveragedPerceptron):
     cdef void set_featuresC(self, ExampleC* eg, const TokenC* tokens, int i) except *:

         _fill_from_token(&eg.atoms[P2_orth], &tokens[i-2])
         _fill_from_token(&eg.atoms[P1_orth], &tokens[i-1])
         _fill_from_token(&eg.atoms[W_orth], &tokens[i])
@@ -191,7 +191,7 @@ cdef class Tagger:
                 nr_class=self.vocab.morphology.n_tags,
                 nr_feat=self.model.nr_feat)
         for i in range(tokens.length):
             if tokens.c[i].pos == 0:
                 self.model.set_featuresC(&eg.c, tokens.c, i)
                 self.model.set_scoresC(eg.c.scores,
                     eg.c.features, eg.c.nr_feat)
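For orientation, set_featuresC fills the example's atom array from a three-token window (i-2, i-1, i), and the loop above only predicts a tag where none is set yet (pos == 0). A rough pure-Python analogue of the window fill, with invented names; the real code writes into a C atoms array:

    # Sketch: context-window feature extraction, names invented.
    def window_features(tokens, i):
        feats = {}
        for offset, prefix in ((-2, 'P2'), (-1, 'P1'), (0, 'W')):
            j = i + offset
            if 0 <= j < len(tokens):
                feats[prefix + '_orth'] = tokens[j]
        return feats

    print(window_features(['I', 'like', 'cats'], 2))
    # {'P2_orth': 'I', 'P1_orth': 'like', 'W_orth': 'cats'}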
@@ -217,7 +217,7 @@ cdef class Tagger:
         for doc in stream:
             self(doc)
             yield doc

     def update(self, Doc tokens, GoldParse gold):
         """Update the statistical model, with tags supplied for the given document.
@@ -251,7 +251,7 @@ cdef class Tagger:
             self.model.updateC(&eg.c)

             self.vocab.morphology.assign_tag_id(&tokens.c[i], eg.guess)

             correct += eg.cost == 0
             self.freqs[TAG][tokens.c[i].tag] += 1
             eg.fill_scores(0, eg.c.nr_class)
@@ -16,7 +16,7 @@ from ..typedefs cimport attr_t, flags_t
 from ..attrs cimport attr_id_t
 from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
 from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
-from ..parts_of_speech cimport CONJ, PUNCT, NOUN
+from ..parts_of_speech cimport CCONJ, PUNCT, NOUN
 from ..parts_of_speech cimport univ_pos_t
 from ..lexeme cimport Lexeme
 from .span cimport Span
@@ -59,13 +59,13 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:

 cdef class Doc:
     """
     A sequence of `Token` objects. Access sentences and named entities,
     export annotations to numpy arrays, losslessly serialize to compressed
     binary strings.

     Aside: Internals
         The `Doc` object holds an array of `TokenC` structs.
         The Python-level `Token` and `Span` objects are views of this
         array, i.e. they don't own the data themselves.

     Code: Construction 1
@@ -80,13 +80,13 @@ cdef class Doc:
         Create a Doc object.

         Aside: Implementation
             This method of constructing a `Doc` object is usually only used
             for deserialization. Standard usage is to construct the document via
             a call to the language object.

         Arguments:
             vocab:
                 A Vocabulary object, which must match any models you want to
                 use (e.g. tokenizer, parser, entity recognizer).

             words:
@@ -156,19 +156,19 @@ cdef class Doc:
         if self.length == 0:
             self.is_tagged = True
             self.is_parsed = True

     def __getitem__(self, object i):
         '''
         doc[i]
             Get the Token object at position i, where i is an integer.
             Negative indexing is supported, and follows the usual Python
             semantics, i.e. doc[-2] is doc[len(doc) - 2].
         doc[start : end]
             Get a `Span` object, starting at position `start`
             and ending at position `end`, where `start` and
             `end` are token indices. For instance,
             `doc[2:5]` produces a span consisting of
             tokens 2, 3 and 4. Stepped slices (e.g. `doc[start : end : step]`)
             are not supported, as `Span` objects must be contiguous (cannot have gaps).
             You can use negative indices and open-ended ranges, which have their
             normal Python semantics.
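A short usage sketch of those indexing semantics; it assumes an nlp pipeline object has already been loaded:

    # Sketch: Doc indexing and slicing as described in the docstring.
    # Assumes e.g. nlp = spacy.load('en') was run earlier.
    doc = nlp(u'Give it back! He pleaded.')
    token = doc[0]      # first Token
    span = doc[2:5]     # Span over tokens 2, 3 and 4
    last = doc[-2]      # same as doc[len(doc) - 2]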
|
@ -188,11 +188,11 @@ cdef class Doc:
|
|||
def __iter__(self):
|
||||
'''
|
||||
for token in doc
|
||||
Iterate over `Token` objects, from which the annotations can
|
||||
be easily accessed. This is the main way of accessing Token
|
||||
objects, which are the main way annotations are accessed from
|
||||
Python. If faster-than-Python speeds are required, you can
|
||||
instead access the annotations as a numpy array, or access the
|
||||
Iterate over `Token` objects, from which the annotations can
|
||||
be easily accessed. This is the main way of accessing Token
|
||||
objects, which are the main way annotations are accessed from
|
||||
Python. If faster-than-Python speeds are required, you can
|
||||
instead access the annotations as a numpy array, or access the
|
||||
underlying C data directly from Cython.
|
||||
'''
|
||||
cdef int i
|
||||
|
@ -251,13 +251,13 @@ cdef class Doc:
|
|||
def __get__(self):
|
||||
if 'has_vector' in self.user_hooks:
|
||||
return self.user_hooks['has_vector'](self)
|
||||
|
||||
|
||||
return any(token.has_vector for token in self)
|
||||
|
||||
property vector:
|
||||
'''
|
||||
A real-valued meaning representation. Defaults to an average of the token vectors.
|
||||
|
||||
|
||||
Type: numpy.ndarray[ndim=1, dtype='float32']
|
||||
'''
|
||||
def __get__(self):
|
||||
|
@ -285,14 +285,14 @@ cdef class Doc:
|
|||
norm += value * value
|
||||
self._vector_norm = sqrt(norm) if norm != 0 else 0
|
||||
return self._vector_norm
|
||||
|
||||
|
||||
def __set__(self, value):
|
||||
self._vector_norm = value
|
||||
self._vector_norm = value
|
||||
|
||||
@property
|
||||
def string(self):
|
||||
return self.text
|
||||
|
||||
|
||||
property text:
|
||||
'''A unicode representation of the document text.'''
|
||||
def __get__(self):
|
||||
|
@ -306,7 +306,7 @@ cdef class Doc:
|
|||
property ents:
|
||||
'''
|
||||
Yields named-entity `Span` objects, if the entity recognizer
|
||||
has been applied to the document. Iterate over the span to get
|
||||
has been applied to the document. Iterate over the span to get
|
||||
individual Token objects, or access the label:
|
||||
|
||||
Example:
|
||||
|
@@ -352,7 +352,7 @@ cdef class Doc:
         cdef int i
         for i in range(self.length):
             self.c[i].ent_type = 0
             # At this point we don't know whether the NER has run over the
             # Doc. If the ent_iob is missing, leave it missing.
             if self.c[i].ent_iob != 0:
                 self.c[i].ent_iob = 2  # Means O. Non-O are set from ents.
@@ -384,9 +384,9 @@ cdef class Doc:
     property noun_chunks:
         '''
         Yields base noun-phrase #[code Span] objects, if the document
         has been syntactically parsed. A base noun phrase, or
         'NP chunk', is a noun phrase that does not permit other NPs to
         be nested within it – so no NP-level coordination, no prepositional
         phrases, and no relative clauses. For example:
         '''
         def __get__(self):
@@ -422,7 +422,7 @@ cdef class Doc:
         def __get__(self):
             if 'sents' in self.user_hooks:
                 return self.user_hooks['sents'](self)

             if not self.is_parsed:
                 raise ValueError(
                     "sentence boundary detection requires the dependency parse, which "
@@ -465,8 +465,8 @@ cdef class Doc:
     @cython.boundscheck(False)
     cpdef np.ndarray to_array(self, object py_attr_ids):
         """
         Given a list of M attribute IDs, export the tokens to a numpy
         `ndarray` of shape (N, M), where `N` is the length
         of the document. The values will be 32-bit integers.

         Example:
@@ -474,7 +474,7 @@ cdef class Doc:
             doc = nlp(text)
             # All strings mapped to integers, for easy export to numpy
             np_array = doc.to_array([attrs.LOWER, attrs.POS, attrs.ENT_TYPE, attrs.IS_ALPHA])

         Arguments:
             attr_ids (list[int]): A list of attribute ID ints.
@@ -520,7 +520,7 @@ cdef class Doc:
         cdef int i
         cdef attr_t attr
         cdef size_t count

         if counts is None:
             counts = PreshCounter()
             output_dict = True
@@ -570,7 +570,7 @@ cdef class Doc:
         cdef TokenC* tokens = self.c
         cdef int length = len(array)
         cdef attr_t[:] values
         for col, attr_id in enumerate(attrs):
             values = array[:, col]
             if attr_id == HEAD:
                 for i in range(length):
@@ -612,11 +612,11 @@ cdef class Doc:
         '''Deserialize, loading from bytes.'''
         self.vocab.serializer.unpack_into(data[4:], self)
         return self

     @staticmethod
     def read_bytes(file_):
         '''
         A static method, used to read serialized #[code Doc] objects from
         a file. For example:

         Example:
@@ -673,7 +673,7 @@ cdef class Doc:
                 "Expected either 3 arguments (deprecated), or 0 (use keyword arguments). "
                 "Arguments supplied:\n%s\n"
                 "Keyword arguments:%s\n" % (len(args), repr(args), repr(attributes)))

         cdef int start = token_by_start(self.c, self.length, start_idx)
         if start == -1:
             return None
@@ -784,7 +784,7 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1:
             if child.l_edge < head.l_edge:
                 head.l_edge = child.l_edge
             head.l_kids += 1

     # Set right edges --- same as above, but iterate in reverse
     for i in range(length-1, -1, -1):
         child = &tokens[i]
@@ -798,4 +798,4 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1:
     for i in range(length):
         if tokens[i].head == 0 and tokens[i].dep != 0:
             tokens[tokens[i].l_edge].sent_start = True
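In words: a token whose head offset is 0 but whose dep label is non-zero is a sentence root, and the left edge of its subtree opens a new sentence. A pure-Python sketch over parallel arrays, with the TokenC fields flattened into lists purely for illustration:

    # Sketch: derive sentence starts from relative head offsets.
    def mark_sentence_starts(heads, deps, l_edges):
        sent_start = [False] * len(heads)
        for i in range(len(heads)):
            if heads[i] == 0 and deps[i] != 0:
                sent_start[l_edges[i]] = True
        return sent_start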
@@ -20,7 +20,7 @@ from .. import parts_of_speech
 from ..attrs cimport LEMMA
 from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
 from ..attrs cimport POS, LEMMA, TAG, DEP
-from ..parts_of_speech cimport CONJ, PUNCT
+from ..parts_of_speech cimport CCONJ, PUNCT

 from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
 from ..attrs cimport IS_BRACKET
@@ -84,7 +84,7 @@ cdef class Token:

     cpdef bint check_flag(self, attr_id_t flag_id) except -1:
         '''Check the value of a boolean flag.

         Arguments:
             flag_id (int): The ID of the flag attribute.
         Returns:
@@ -225,7 +225,7 @@ cdef class Token:
     property vector:
         '''
         A real-valued meaning representation.

         Type: numpy.ndarray[ndim=1, dtype='float32']
         '''
         def __get__(self):
@@ -343,7 +343,7 @@ cdef class Token:
         '''
         def __get__(self):
             cdef const TokenC* head_ptr = self.c
             # guard against infinite loop, no token can have
             # more ancestors than tokens in the tree
             cdef int i = 0
             while head_ptr.head != 0 and i < self.doc.length:
@@ -370,7 +370,7 @@ cdef class Token:

     property head:
         '''The syntactic parent, or "governor", of this token.

         Returns: Token
         '''
         def __get__(self):
@@ -390,7 +390,7 @@ cdef class Token:

             # is the new head a descendant of the old head
             cdef bint is_desc = old_head.is_ancestor_of(new_head)

             cdef int new_edge
             cdef Token anc, child
@@ -420,7 +420,7 @@ cdef class Token:
                     if anc.c.l_edge <= new_edge:
                         break
                     anc.c.l_edge = new_edge

             elif self.c.head < 0:  # right dependent
                 old_head.c.r_kids -= 1
                 # do the same thing as for l_edge
@@ -435,7 +435,7 @@ cdef class Token:
                     if child.c.r_edge > new_edge:
                         new_edge = child.c.r_edge
                 old_head.c.r_edge = new_edge

                 for anc in old_head.ancestors:
                     if anc.c.r_edge >= new_edge:
                         break
@@ -598,19 +598,19 @@ cdef class Token:
     property is_punct:
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_PUNCT)

     property is_space:
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_SPACE)

     property is_bracket:
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_BRACKET)

     property is_quote:
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_QUOTE)

     property is_left_punct:
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_LEFT_PUNCT)

     property is_right_punct:
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_RIGHT_PUNCT)

     property like_url: