From 6bda23ad26edc42102bbdbcda5c025972829f83c Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Thu, 25 Jun 2020 16:26:44 +0200
Subject: [PATCH] Improve handling of missing values in NER

---
 spacy/gold/iob_utils.py | 52 ++++++++++++++++++++++-------------------
 1 file changed, 28 insertions(+), 24 deletions(-)

diff --git a/spacy/gold/iob_utils.py b/spacy/gold/iob_utils.py
index e67cba7e0..a892308f9 100644
--- a/spacy/gold/iob_utils.py
+++ b/spacy/gold/iob_utils.py
@@ -91,31 +91,35 @@ def biluo_tags_from_offsets(doc, entities, missing="O"):
     biluo = ["-" for _ in doc]
     # Handle entity cases
     for start_char, end_char, label in entities:
-        for token_index in range(start_char, end_char):
-            if token_index in tokens_in_ents.keys():
-                raise ValueError(
-                    Errors.E103.format(
-                        span1=(
-                            tokens_in_ents[token_index][0],
-                            tokens_in_ents[token_index][1],
-                            tokens_in_ents[token_index][2],
-                        ),
-                        span2=(start_char, end_char, label),
+        if not label:
+            if start_char in starts:
+                biluo[starts[start_char]] = "O"
+        else:
+            for token_index in range(start_char, end_char):
+                if token_index in tokens_in_ents.keys():
+                    raise ValueError(
+                        Errors.E103.format(
+                            span1=(
+                                tokens_in_ents[token_index][0],
+                                tokens_in_ents[token_index][1],
+                                tokens_in_ents[token_index][2],
+                            ),
+                            span2=(start_char, end_char, label),
+                        )
                     )
-                )
-            tokens_in_ents[token_index] = (start_char, end_char, label)
+                tokens_in_ents[token_index] = (start_char, end_char, label)
 
-        start_token = starts.get(start_char)
-        end_token = ends.get(end_char)
-        # Only interested if the tokenization is correct
-        if start_token is not None and end_token is not None:
-            if start_token == end_token:
-                biluo[start_token] = f"U-{label}"
-            else:
-                biluo[start_token] = f"B-{label}"
-                for i in range(start_token + 1, end_token):
-                    biluo[i] = f"I-{label}"
-                biluo[end_token] = f"L-{label}"
+            start_token = starts.get(start_char)
+            end_token = ends.get(end_char)
+            # Only interested if the tokenization is correct
+            if start_token is not None and end_token is not None:
+                if start_token == end_token:
+                    biluo[start_token] = f"U-{label}"
+                else:
+                    biluo[start_token] = f"B-{label}"
+                    for i in range(start_token + 1, end_token):
+                        biluo[i] = f"I-{label}"
+                    biluo[end_token] = f"L-{label}"
     # Now distinguish the O cases from ones where we miss the tokenization
     entity_chars = set()
     for start_char, end_char, label in entities:
@@ -127,7 +131,7 @@ def biluo_tags_from_offsets(doc, entities, missing="O"):
                 break
         else:
             biluo[token.i] = missing
-    if "-" in biluo:
+    if "-" in biluo and missing != "-":
         ent_str = str(entities)
         warnings.warn(
             Warnings.W030.format(
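
Usage note (commentary, not part of the patch): a minimal sketch of the
behaviour this change introduces. It assumes the helper is importable as
`from spacy.gold import biluo_tags_from_offsets` (the v2-era name; later
v3 releases expose it as spacy.training.offsets_to_biluo_tags). With this
patch, an entity offset with a falsy label is tagged as an explicit "O" at
its start token instead of producing a bogus tag with an empty label, and
passing missing="-" no longer triggers the W030 misalignment warning.

    # Sketch only; assumes a spaCy checkout containing this patch.
    import spacy
    from spacy.gold import biluo_tags_from_offsets

    nlp = spacy.blank("en")
    doc = nlp("I flew to San Francisco Valley yesterday")

    # (31, 40, "") has a falsy label: the token starting at char 31
    # ("yesterday") is tagged "O" rather than treated as an entity.
    entities = [(10, 30, "LOC"), (31, 40, "")]
    tags = biluo_tags_from_offsets(doc, entities)
    print(tags)
    # ['O', 'O', 'O', 'B-LOC', 'I-LOC', 'L-LOC', 'O']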