From 8aa7882762e9be1af8c42885bb874a89f2c730b3 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Sat, 8 Dec 2018 10:49:10 +0100
Subject: [PATCH] Make NORM a token attribute (#3029)

See #3028. The solution in this patch is pretty debatable.

What we do is give the TokenC struct a .norm field, by repurposing the previously idle .sense attribute. It's nice to repurpose a previous field because it means the TokenC doesn't change size, so even if someone's using the internals very deeply, nothing will break.

The weird thing here is that the TokenC and the LexemeC both have an attribute named NORM. This arguably assists in backwards compatibility. On the other hand, maybe it's really bad! We're changing the semantics of the attribute subtly, so maybe it's better if someone calling lex.norm gets a breakage, and instead is told to write lex.default_norm?

Overall I believe this patch makes the NORM feature work the way we sort of expected it to work. Certainly it's much more like how the docs describe it, and more in line with how we've been directing people to use the norm attribute. We'll also be able to use token.norm to do stuff like spelling correction, which is pretty cool.
---
 spacy/structs.pxd                        |  2 +-
 spacy/tests/regression/test_issue2754.py | 14 ++++++++++++++
 spacy/tokens/token.pxd                   |  7 +++++++
 spacy/tokens/token.pyx                   | 10 ++++++++--
 spacy/vocab.pyx                          |  7 +++++--
 5 files changed, 35 insertions(+), 5 deletions(-)
 create mode 100644 spacy/tests/regression/test_issue2754.py

diff --git a/spacy/structs.pxd b/spacy/structs.pxd
index cfcadc3d0..fa282cae7 100644
--- a/spacy/structs.pxd
+++ b/spacy/structs.pxd
@@ -58,7 +58,7 @@ cdef struct TokenC:
     attr_t tag
     int idx
     attr_t lemma
-    attr_t sense
+    attr_t norm
     int head
     attr_t dep
 
diff --git a/spacy/tests/regression/test_issue2754.py b/spacy/tests/regression/test_issue2754.py
new file mode 100644
index 000000000..5f76727f8
--- /dev/null
+++ b/spacy/tests/regression/test_issue2754.py
@@ -0,0 +1,14 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import pytest
+from spacy.lang.en import English
+
+def test_issue2754():
+    """Test that 'a' and 'am' keep their own text as norm_ (no exceptional norm values, see #2754)."""
+    nlp = English()
+    a = nlp('a')
+    assert a[0].norm_ == 'a'  # the norm of the single token must equal its text
+    am = nlp('am')
+    assert am[0].norm_ == 'am'  # likewise for 'am'
+
diff --git a/spacy/tokens/token.pxd b/spacy/tokens/token.pxd
index 9b02d07fb..bb9f7d070 100644
--- a/spacy/tokens/token.pxd
+++ b/spacy/tokens/token.pxd
@@ -34,6 +34,11 @@ cdef class Token:
             return Lexeme.c_check_flag(token.lex, feat_name)
         elif feat_name == LEMMA:
             return token.lemma
+        elif feat_name == NORM:
+            if token.norm == 0:
+                return token.lex.norm
+            else:
+                return token.norm
         elif feat_name == POS:
             return token.pos
         elif feat_name == TAG:
@@ -58,6 +63,8 @@ cdef class Token:
                                        attr_t value) nogil:
         if feat_name == LEMMA:
             token.lemma = value
+        elif feat_name == NORM:
+            token.norm = value
         elif feat_name == POS:
             token.pos = <univ_pos_t>value
         elif feat_name == TAG:
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 5c8af1333..0266004b5 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -249,7 +249,10 @@ cdef class Token:
             or norm exceptions.
         """
         def __get__(self):
-            return self.c.lex.norm
+            if self.c.norm == 0:
+                return self.c.lex.norm
+            else:
+                return self.c.norm
 
     property shape:
         """RETURNS (uint64): ID of the token's shape, a transform of the
@@ -711,7 +714,10 @@ cdef class Token:
             norm exceptions.
         """
         def __get__(self):
-            return self.vocab.strings[self.c.lex.norm]
+            return self.vocab.strings[self.norm]
+
+        def __set__(self, unicode norm_):
+            self.c.norm = self.vocab.strings.add(norm_)
 
     property shape_:
         """RETURNS (unicode): Transform of the tokens's string, to show
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 42fd2f46e..e28aa0b86 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -17,7 +17,7 @@ from .structs cimport SerializedLexemeC
 from .compat import copy_reg, basestring_
 from .errors import Errors
 from .lemmatizer import Lemmatizer
-from .attrs import intify_attrs
+from .attrs import intify_attrs, NORM
 from .vectors import Vectors
 from ._ml import link_vectors_to_models
 from . import util
@@ -234,7 +234,10 @@ cdef class Vocab:
                 self.morphology.assign_tag(token, props[TAG])
             for attr_id, value in props.items():
                 Token.set_struct_attr(token, attr_id, value)
-                Lexeme.set_struct_attr(lex, attr_id, value)
+                # NORM is the only one that overlaps between the two
+                # (which is maybe not great?)
+                if attr_id != NORM:
+                    Lexeme.set_struct_attr(lex, attr_id, value)
         return tokens
 
     @property