Merge remote-tracking branch 'upstream/master' into bugfix/tokenizer-special-cases-matcher

2025-10-29 23:17:59 +03:00 · 2019-09-08 21:30:01 +02:00 · 2019-09-08 21:30:01 +02:00 · 64f86b7e97
commit 64f86b7e97
parent d1679819ab 3780e2ff50
16 changed files with 1567214 additions and 118 deletions
--- a/spacy/lang/hr/init.py
+++ b/spacy/lang/hr/init.py
@ -18,6 +18,7 @@ class CroatianDefaults(Language.Defaults):
    )
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
    stop_words = STOP_WORDS
+    resources = {"lemma_lookup": "lemma_lookup.json"}


 class Croatian(Language):
--- a/spacy/lang/hr/lemma_lookup.json
+++ b/spacy/lang/hr/lemma_lookup.json
--- a/spacy/lang/hr/lemma_lookup_license.txt
+++ b/spacy/lang/hr/lemma_lookup_license.txt
@ -0,0 +1,15 @@
+The list of Croatian lemmas was extracted from the reldi-tagger repository (https://github.com/clarinsi/reldi-tagger).
+Reldi-tagger is licensed under the Apache 2.0 licence.
+
+@InProceedings{ljubesic16-new,
+  author = {Nikola Ljubešić and Filip Klubička and Željko Agić and Ivo-Pavao Jazbec},
+  title = {New Inflectional Lexicons and Training Corpora for Improved Morphosyntactic Annotation of Croatian and Serbian},
+  booktitle = {Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016)},
+  year = {2016},
+  date = {23-28},
+  location = {Portorož, Slovenia},
+  editor = {Nicoletta Calzolari (Conference Chair) and Khalid Choukri and Thierry Declerck and Sara Goggi and Marko Grobelnik and Bente Maegaard and Joseph Mariani and Helene Mazo and Asuncion Moreno and Jan Odijk and Stelios Piperidis},
+  publisher = {European Language Resources Association (ELRA)},
+  address = {Paris, France},
+  isbn = {978-2-9517408-9-1}
+ }
--- a/spacy/lang/sr/init.py
+++ b/spacy/lang/sr/init.py
@ -21,6 +21,7 @@ class SerbianDefaults(Language.Defaults):
    )
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS
+    resources = {"lemma_lookup": "lemma_lookup.json"}


 class Serbian(Language):
--- a/spacy/lang/sr/examples.py
+++ b/spacy/lang/sr/examples.py
@ -12,13 +12,14 @@ Example sentences to test spaCy and its language models.

 sentences = [
    # Translations from English
-    "Apple планира куповину америчког стартапа за $1 милијарду."
+    "Apple планира куповину америчког стартапа за $1 милијарду.",
    "Беспилотни аутомобили пребацују одговорност осигурања на произвођаче.",
    "Лондон је велики град у Уједињеном Краљевству.",
    "Где си ти?",
    "Ко је председник Француске?",
    # Serbian common and slang
    "Moj ћале је инжењер!",
-    "Новак Ђоковић је најбољи тенисер света." "У Пироту има добрих кафана!",
+    "Новак Ђоковић је најбољи тенисер света.",
+    "У Пироту има добрих кафана!",
    "Музеј Николе Тесле се налази у Београду.",
 ]
--- a/spacy/lang/sr/lemma_lookup.json
+++ b/spacy/lang/sr/lemma_lookup.json
--- a/spacy/lang/sr/lemma_lookup_licence.txt
+++ b/spacy/lang/sr/lemma_lookup_licence.txt
@ -0,0 +1,32 @@
+Copyright @InProceedings{ljubesic16-new,
+  author = {Nikola Ljubešić and Filip Klubička and Željko Agić and Ivo-Pavao Jazbec},
+  title = {New Inflectional Lexicons and Training Corpora for Improved Morphosyntactic Annotation of Croatian and Serbian},
+  booktitle = {Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016)},
+  year = {2016},
+  date = {23-28},
+  location = {Portorož, Slovenia},
+  editor = {Nicoletta Calzolari (Conference Chair) and Khalid Choukri and Thierry Declerck and Sara Goggi and Marko Grobelnik and Bente Maegaard and Joseph Mariani and Helene Mazo and Asuncion Moreno and Jan Odijk and Stelios Piperidis},
+  publisher = {European Language Resources Association (ELRA)},
+  address = {Paris, France},
+  isbn = {978-2-9517408-9-1}
+ }
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+
+The licence of Serbian lemmas was adopted from Serbian lexicon:
+ - sr.lexicon (https://github.com/clarinsi/reldi-tagger/blob/master/sr.lexicon)
+
+Changelog:
+ - Lexicon is translated into Cyrillic
+ - Word order is sorted
--- a/spacy/lang/sr/tokenizer_exceptions.py
+++ b/spacy/lang/sr/tokenizer_exceptions.py
@ -15,6 +15,7 @@ _abbrev_exc = [
    {ORTH: "пет", LEMMA: "петак", NORM: "петак"},
    {ORTH: "суб", LEMMA: "субота", NORM: "субота"},
    {ORTH: "нед", LEMMA: "недеља", NORM: "недеља"},
+
    # Months abbreviations
    {ORTH: "јан", LEMMA: "јануар", NORM: "јануар"},
    {ORTH: "феб", LEMMA: "фебруар", NORM: "фебруар"},
@ -27,7 +28,7 @@ _abbrev_exc = [
    {ORTH: "септ", LEMMA: "септембар", NORM: "септембар"},
    {ORTH: "окт", LEMMA: "октобар", NORM: "октобар"},
    {ORTH: "нов", LEMMA: "новембар", NORM: "новембар"},
-    {ORTH: "дец", LEMMA: "децембар", NORM: "децембар"},
+    {ORTH: "дец", LEMMA: "децембар", NORM: "децембар"}
 ]


--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@ -103,6 +103,11 @@ def he_tokenizer():
    return get_lang_class("he").Defaults.create_tokenizer()


+@pytest.fixture(scope="session")
+def hr_tokenizer():
+    return get_lang_class("hr").Defaults.create_tokenizer()
+
+
@pytest.fixture
 def hu_tokenizer():
    return get_lang_class("hu").Defaults.create_tokenizer()
--- a/spacy/tests/doc/test_retokenize_merge.py
+++ b/spacy/tests/doc/test_retokenize_merge.py
@ -99,6 +99,41 @@ def test_doc_retokenize_spans_merge_tokens(en_tokenizer):
    assert doc[0].ent_type_ == "GPE"


+def test_doc_retokenize_spans_merge_tokens_default_attrs(en_tokenizer):
+    text = "The players start."
+    heads = [1, 1, 0, -1]
+    tokens = en_tokenizer(text)
+    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], tags=["DT", "NN", "VBZ", "."], pos=["DET", "NOUN", "VERB", "PUNCT"], heads=heads)
+    assert len(doc) == 4
+    assert doc[0].text == "The"
+    assert doc[0].tag_ == "DT"
+    assert doc[0].pos_ == "DET"
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[0:2])
+    assert len(doc) == 3
+    assert doc[0].text == "The players"
+    assert doc[0].tag_ == "NN"
+    assert doc[0].pos_ == "NOUN"
+    assert doc[0].lemma_ == "The players"
+    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], tags=["DT", "NN", "VBZ", "."], pos=["DET", "NOUN", "VERB", "PUNCT"], heads=heads)
+    assert len(doc) == 4
+    assert doc[0].text == "The"
+    assert doc[0].tag_ == "DT"
+    assert doc[0].pos_ == "DET"
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[0:2])
+        retokenizer.merge(doc[2:4])
+    assert len(doc) == 2
+    assert doc[0].text == "The players"
+    assert doc[0].tag_ == "NN"
+    assert doc[0].pos_ == "NOUN"
+    assert doc[0].lemma_ == "The players"
+    assert doc[1].text == "start ."
+    assert doc[1].tag_ == "VBZ"
+    assert doc[1].pos_ == "VERB"
+    assert doc[1].lemma_ == "start ."
+
+
 def test_doc_retokenize_spans_merge_heads(en_tokenizer):
    text = "I found a pilates class near work."
    heads = [1, 0, 2, 1, -3, -1, -1, -6]
@ -182,7 +217,7 @@ def test_doc_retokenize_spans_entity_merge(en_tokenizer):
    assert len(doc) == 15


-def test_doc_retokenize_spans_entity_merge_iob():
+def test_doc_retokenize_spans_entity_merge_iob(en_vocab):
    # Test entity IOB stays consistent after merging
    words = ["a", "b", "c", "d", "e"]
    doc = Doc(Vocab(), words=words)
@ -195,10 +230,23 @@ def test_doc_retokenize_spans_entity_merge_iob():
    assert doc[2].ent_iob_ == "I"
    assert doc[3].ent_iob_ == "B"
    with doc.retokenize() as retokenizer:
-        retokenizer.merge(doc[0:1])
+        retokenizer.merge(doc[0:2])
+    assert len(doc) == len(words) - 1
    assert doc[0].ent_iob_ == "B"
    assert doc[1].ent_iob_ == "I"

+    # Test that IOB stays consistent with provided IOB
+    words = ["a", "b", "c", "d", "e"]
+    doc = Doc(Vocab(), words=words)
+    with doc.retokenize() as retokenizer:
+        attrs = {"ent_type": "ent-abc", "ent_iob": 1}
+        retokenizer.merge(doc[0:3], attrs=attrs)
+        retokenizer.merge(doc[3:5], attrs=attrs)
+    assert doc[0].ent_iob_ == "B"
+    assert doc[1].ent_iob_ == "I"
+
+    # if no parse/heads, the first word in the span is the root and provides
+    # default values
    words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
    doc = Doc(Vocab(), words=words)
    doc.ents = [
@ -215,7 +263,53 @@ def test_doc_retokenize_spans_entity_merge_iob():
        retokenizer.merge(doc[7:9])
    assert len(doc) == 6
    assert doc[3].ent_iob_ == "B"
-    assert doc[4].ent_iob_ == "I"
+    assert doc[3].ent_type_ == "ent-de"
+    assert doc[4].ent_iob_ == "B"
+    assert doc[4].ent_type_ == "ent-fg"
+
+    # if there is a parse, span.root provides default values
+    words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
+    heads = [ 0,  -1,   1,  -3,  -4,  -5,  -1,  -7,  -8 ]
+    ents =  [
+        (3, 5, "ent-de"),
+        (5, 7, "ent-fg"),
+    ]
+    deps =  ["dep"] * len(words)
+    en_vocab.strings.add("ent-de")
+    en_vocab.strings.add("ent-fg")
+    en_vocab.strings.add("dep")
+    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
+    assert doc[2:4].root == doc[3] # root of 'c d' is d
+    assert doc[4:6].root == doc[4] # root of 'e f' is e
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[2:4])
+        retokenizer.merge(doc[4:6])
+        retokenizer.merge(doc[7:9])
+    assert len(doc) == 6
+    assert doc[2].ent_iob_ == "B"
+    assert doc[2].ent_type_ == "ent-de"
+    assert doc[3].ent_iob_ == "I"
+    assert doc[3].ent_type_ == "ent-de"
+    assert doc[4].ent_iob_ == "B"
+    assert doc[4].ent_type_ == "ent-fg"
+
+    # check that B is preserved if span[start] is B
+    words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
+    heads = [ 0,  -1,   1,   1,  -4,  -5,  -1,  -7,  -8 ]
+    ents =  [
+        (3, 5, "ent-de"),
+        (5, 7, "ent-de"),
+    ]
+    deps =  ["dep"] * len(words)
+    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[3:5])
+        retokenizer.merge(doc[5:7])
+    assert len(doc) == 7
+    assert doc[3].ent_iob_ == "B"
+    assert doc[3].ent_type_ == "ent-de"
+    assert doc[4].ent_iob_ == "B"
+    assert doc[4].ent_type_ == "ent-de"


 def test_doc_retokenize_spans_sentence_update_after_merge(en_tokenizer):
--- a/spacy/tests/lang/hr/test_lemma.py
+++ b/spacy/tests/lang/hr/test_lemma.py
@ -0,0 +1,20 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+
+@pytest.mark.parametrize(
+    "string,lemma",
+    [
+        ("trčao", "trčati"),
+        ("adekvatnim", "adekvatan"),
+        ("dekontaminacijama", "dekontaminacija"),
+        ("filologovih", "filologov"),
+        ("je", "biti"),
+        ("se", "sebe"),
+    ],
+)
+def test_hr_lemmatizer_lookup_assigns(hr_tokenizer, string, lemma):
+    tokens = hr_tokenizer(string)
+    assert tokens[0].lemma_ == lemma
--- a/spacy/tests/lang/sr/test_lemmatizer.py
+++ b/spacy/tests/lang/sr/test_lemmatizer.py
@ -0,0 +1,20 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+
+@pytest.mark.parametrize(
+    "string,lemma",
+    [
+        ("најадекватнији", "адекватан"),
+        ("матурирао", "матурирати"),
+        ("планираћемо", "планирати"),
+        ("певају", "певати"),
+        ("нама", "ми"),
+        ("се", "себе"),
+    ],
+)
+def test_sr_lemmatizer_lookup_assigns(sr_tokenizer, string, lemma):
+    tokens = sr_tokenizer(string)
+    assert tokens[0].lemma_ == lemma
--- a/spacy/tests/regression/test_issue1001-1500.py
+++ b/spacy/tests/regression/test_issue1001-1500.py
@ -13,7 +13,6 @@ from spacy.lemmatizer import Lemmatizer
 from spacy.symbols import ORTH, LEMMA, POS, VERB, VerbForm_part


-@pytest.mark.xfail
 def test_issue1061():
    '''Test special-case works after tokenizing. Was caching problem.'''
    text = 'I like _MATH_ even _MATH_ when _MATH_, except when _MATH_ is _MATH_! but not _MATH_.'
--- a/spacy/tokenizer.pxd
+++ b/spacy/tokenizer.pxd
@ -16,10 +16,10 @@ cdef class Tokenizer:
    cdef PreshMap _specials
    cpdef readonly Vocab vocab

-    cdef public object token_match
-    cdef public object prefix_search
-    cdef public object suffix_search
-    cdef public object infix_finditer
+    cdef object _token_match
+    cdef object _prefix_search
+    cdef object _suffix_search
+    cdef object _infix_finditer
    cdef object _rules
    cdef object _special_matcher

--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@ -63,6 +63,38 @@ cdef class Tokenizer:
        self._special_matcher = Matcher(self.vocab)
        self._load_special_cases(rules)

+    property token_match:
+        def __get__(self):
+            return self._token_match
+
+        def __set__(self, token_match):
+            self._token_match = token_match
+            self._flush_cache()
+
+    property prefix_search:
+        def __get__(self):
+            return self._prefix_search
+
+        def __set__(self, prefix_search):
+            self._prefix_search = prefix_search
+            self._flush_cache()
+
+    property suffix_search:
+        def __get__(self):
+            return self._suffix_search
+
+        def __set__(self, suffix_search):
+            self._suffix_search = suffix_search
+            self._flush_cache()
+
+    property infix_finditer:
+        def __get__(self):
+            return self._infix_finditer
+
+        def __set__(self, infix_finditer):
+            self._infix_finditer = infix_finditer
+            self._flush_cache()
+
    def __reduce__(self):
        args = (self.vocab,
                self._rules,
@ -153,9 +185,23 @@ cdef class Tokenizer:
        for text in texts:
            yield self(text)

+    def _flush_cache(self):
+        self._reset_cache([key for key in self._cache if not key in self._specials])
+
    def _reset_cache(self, keys):
        for k in keys:
            del self._cache[k]
+            if not k in self._specials:
+                cached = <_Cached*>self._cache.get(k)
+                if cached is not NULL:
+                    self.mem.free(cached)
+
+    def _reset_specials(self):
+        for k in self._specials:
+            cached = <_Cached*>self._specials.get(k)
+            del self._specials[k]
+            if cached is not NULL:
+                self.mem.free(cached)

    cdef int _apply_special_cases(self, Doc doc):
        """Retokenize doc according to special cases.
@ -409,7 +455,14 @@ cdef class Tokenizer:
        cached.is_lex = False
        cached.data.tokens = self.vocab.make_fused_token(substrings)
        key = hash_string(string)
+        stale_special = <_Cached*>self._specials.get(key)
+        stale_cached = <_Cached*>self._cache.get(key)
+        self._flush_cache()
        self._specials.set(key, cached)
+        if stale_special is not NULL:
+            self.mem.free(stale_special)
+        if stale_special != stale_cached and stale_cached is not NULL:
+            self.mem.free(stale_cached)
        self._rules[string] = substrings
        self._special_matcher.add(string, None, [{ORTH: token.text} for token in self._tokenize_affixes(string)])

@ -493,7 +546,10 @@ cdef class Tokenizer:
        if data.get("rules"):
            # make sure to hard reset the cache to remove data from the default exceptions
            self._rules = {}
+            self._reset_cache([key for key in self._cache])
+            self._reset_specials()
            self._cache = PreshMap()
+            self._specials = PreshMap()
            for string, substrings in data.get("rules", {}).items():
                self.add_special_case(string, substrings)

--- a/spacy/tokens/_retokenize.pyx
+++ b/spacy/tokens/_retokenize.pyx
@ -109,13 +109,8 @@ cdef class Retokenizer:

    def __exit__(self, *args):
        # Do the actual merging here
-        if len(self.merges) > 1:
-            _bulk_merge(self.doc, self.merges)
-        elif len(self.merges) == 1:
-            (span, attrs) = self.merges[0]
-            start = span.start
-            end = span.end
-            _merge(self.doc, start, end, attrs)
+        if len(self.merges) >= 1:
+            _merge(self.doc, self.merges)
        # Iterate in order, to keep things simple.
        for start_char, orths, heads, attrs in sorted(self.splits):
            # Resolve token index
@ -140,95 +135,7 @@ cdef class Retokenizer:
            _split(self.doc, token_index, orths, head_indices, attrs)


-def _merge(Doc doc, int start, int end, attributes):
-    """Retokenize the document, such that the span at
-    `doc.text[start_idx : end_idx]` is merged into a single token. If
-    `start_idx` and `end_idx `do not mark start and end token boundaries,
-    the document remains unchanged.
-    start_idx (int): Character index of the start of the slice to merge.
-    end_idx (int): Character index after the end of the slice to merge.
-    **attributes: Attributes to assign to the merged token. By default,
-        attributes are inherited from the syntactic root of the span.
-    RETURNS (Token): The newly merged token, or `None` if the start and end
-        indices did not fall at token boundaries.
-    """
-    cdef Span span = doc[start:end]
-    cdef int start_char = span.start_char
-    cdef int end_char = span.end_char
-    # Resize the doc.tensor, if it's set. Let the last row for each token stand
-    # for the merged region. To do this, we create a boolean array indicating
-    # whether the row is to be deleted, then use numpy.delete
-    if doc.tensor is not None and doc.tensor.size != 0:
-        doc.tensor = _resize_tensor(doc.tensor, [(start, end)])
-    # Get LexemeC for newly merged token
-    new_orth = ''.join([t.text_with_ws for t in span])
-    if span[-1].whitespace_:
-        new_orth = new_orth[:-len(span[-1].whitespace_)]
-    cdef const LexemeC* lex = doc.vocab.get(doc.mem, new_orth)
-    # House the new merged token where it starts
-    cdef TokenC* token = &doc.c[start]
-    token.spacy = doc.c[end-1].spacy
-    for attr_name, attr_value in attributes.items():
-        if attr_name == "_":  # Set extension attributes
-            for ext_attr_key, ext_attr_value in attr_value.items():
-                doc[start]._.set(ext_attr_key, ext_attr_value)
-        elif attr_name == TAG:
-            doc.vocab.morphology.assign_tag(token, attr_value)
-        else:
-            # Set attributes on both token and lexeme to take care of token
-            # attribute vs. lexical attribute without having to enumerate them.
-            # If an attribute name is not valid, set_struct_attr will ignore it.
-            Token.set_struct_attr(token, attr_name, attr_value)
-            Lexeme.set_struct_attr(<LexemeC*>lex, attr_name, attr_value)
-    # Make sure ent_iob remains consistent
-    if doc.c[end].ent_iob == 1 and token.ent_iob in (0, 2):
-        if token.ent_type == doc.c[end].ent_type:
-            token.ent_iob = 3
-        else:
-            # If they're not the same entity type, let them be two entities
-            doc.c[end].ent_iob = 3
-    # Begin by setting all the head indices to absolute token positions
-    # This is easier to work with for now than the offsets
-    # Before thinking of something simpler, beware the case where a
-    # dependency bridges over the entity. Here the alignment of the
-    # tokens changes.
-    span_root = span.root.i
-    token.dep = span.root.dep
-    # We update token.lex after keeping span root and dep, since
-    # setting token.lex will change span.start and span.end properties
-    # as it modifies the character offsets in the doc
-    token.lex = lex
-    for i in range(doc.length):
-        doc.c[i].head += i
-    # Set the head of the merged token, and its dep relation, from the Span
-    token.head = doc.c[span_root].head
-    # Adjust deps before shrinking tokens
-    # Tokens which point into the merged token should now point to it
-    # Subtract the offset from all tokens which point to >= end
-    offset = (end - start) - 1
-    for i in range(doc.length):
-        head_idx = doc.c[i].head
-        if start <= head_idx < end:
-            doc.c[i].head = start
-        elif head_idx >= end:
-            doc.c[i].head -= offset
-    # Now compress the token array
-    for i in range(end, doc.length):
-        doc.c[i - offset] = doc.c[i]
-    for i in range(doc.length - offset, doc.length):
-        memset(&doc.c[i], 0, sizeof(TokenC))
-        doc.c[i].lex = &EMPTY_LEXEME
-    doc.length -= offset
-    for i in range(doc.length):
-        # ...And, set heads back to a relative position
-        doc.c[i].head -= i
-    # Set the left/right children, left/right edges
-    set_children_from_heads(doc.c, doc.length)
-    # Return the merged Python object
-    return doc[start]
-
-
-def _bulk_merge(Doc doc, merges):
+def _merge(Doc doc, merges):
    """Retokenize the document, such that the spans described in 'merges'
     are merged into a single token. This method assumes that the merges
     are in the same order at which they appear in the doc, and that merges
@ -256,6 +163,26 @@ def _bulk_merge(Doc doc, merges):
        spans.append(span)
        # House the new merged token where it starts
        token = &doc.c[start]
+        # Initially set attributes to attributes of span root
+        token.tag = doc.c[span.root.i].tag
+        token.pos = doc.c[span.root.i].pos
+        token.morph = doc.c[span.root.i].morph
+        token.ent_iob = doc.c[span.root.i].ent_iob
+        token.ent_type = doc.c[span.root.i].ent_type
+        merged_iob = token.ent_iob
+        # If span root is part of an entity, merged token is B-ENT
+        if token.ent_iob in (1, 3):
+            merged_iob = 3
+            # If start token is I-ENT and previous token is of the same
+            # type, then I-ENT (could check I-ENT from start to span root)
+            if doc.c[start].ent_iob == 1 and start > 0 \
+                    and doc.c[start].ent_type == token.ent_type \
+                    and doc.c[start - 1].ent_type == token.ent_type:
+                merged_iob = 1
+        token.ent_iob = merged_iob
+        # Unset attributes that don't match new token
+        token.lemma = 0
+        token.norm = 0
        tokens[merge_index] = token
    # Resize the doc.tensor, if it's set. Let the last row for each token stand
    # for the merged region. To do this, we create a boolean array indicating
@ -351,17 +278,7 @@ def _bulk_merge(Doc doc, merges):
    # Set the left/right children, left/right edges
    set_children_from_heads(doc.c, doc.length)
    # Make sure ent_iob remains consistent
-    for (span, _) in merges:
-        if(span.end < len(offsets)):
-        # If it's not the last span
-            token_after_span_position = offsets[span.end]
-            if doc.c[token_after_span_position].ent_iob == 1\
-                    and doc.c[token_after_span_position - 1].ent_iob in (0, 2):
-                if doc.c[token_after_span_position - 1].ent_type == doc.c[token_after_span_position].ent_type:
-                    doc.c[token_after_span_position - 1].ent_iob = 3
-                else:
-                    # If they're not the same entity type, let them be two entities
-                    doc.c[token_after_span_position].ent_iob = 3
+    make_iob_consistent(doc.c, doc.length)
    # Return the merged Python object
    return doc[spans[0].start]

@ -480,3 +397,12 @@ def _validate_extensions(extensions):
            raise ValueError(Errors.E118.format(attr=key))
        if not is_writable_attr(extension):
            raise ValueError(Errors.E119.format(attr=key))
+
+
+cdef make_iob_consistent(TokenC* tokens, int length):
+    cdef int i
+    if tokens[0].ent_iob == 1:
+        tokens[0].ent_iob = 3
+    for i in range(1, length):
+        if tokens[i].ent_iob == 1 and tokens[i - 1].ent_type != tokens[i].ent_type:
+            tokens[i].ent_iob = 3