From a89fecce97c06d7315bb955de1127025fa310b4b Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Thu, 11 Jul 2019 00:43:55 +0200
Subject: [PATCH 1/4] failing unit test for issue #3869

---
 spacy/tests/regression/test_issue3869.py | 29 ++++++++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100644 spacy/tests/regression/test_issue3869.py

diff --git a/spacy/tests/regression/test_issue3869.py b/spacy/tests/regression/test_issue3869.py
new file mode 100644
index 000000000..72a485042
--- /dev/null
+++ b/spacy/tests/regression/test_issue3869.py
@@ -0,0 +1,29 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import pytest
+
+from spacy.attrs import IS_ALPHA
+from spacy.lang.en import English
+
+
+@pytest.mark.parametrize(
+    "sentence",
+    [
+        'The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.',
+        'The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale\'s #1.'
+    ],
+)
+def test_issue3869(sentence):
+    """Test that the Doc's count_by function works consistently"""
+    nlp = English()
+
+    doc = nlp(sentence)
+
+    count = 0
+    for token in doc:
+        count += token.is_alpha
+
+    assert count == doc.count_by(IS_ALPHA).get(1, 0)
+
+

From e0804123854b91bbad5a3e084de867d5fbbff788 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Thu, 11 Jul 2019 01:53:06 +0200
Subject: [PATCH 2/4] tracked the bug down to PreshCounter.inc - still unclear
 what goes wrong

---
 spacy/tests/regression/test_issue3869.py |  6 ++++-
 spacy/tokens/doc.pxd                     |  1 +
 spacy/tokens/doc.pyx                     | 33 +++++++++++++++++++++++-
 3 files changed, 38 insertions(+), 2 deletions(-)

diff --git a/spacy/tests/regression/test_issue3869.py b/spacy/tests/regression/test_issue3869.py
index 72a485042..d76da6989 100644
--- a/spacy/tests/regression/test_issue3869.py
+++ b/spacy/tests/regression/test_issue3869.py
@@ -11,13 +11,17 @@ from spacy.lang.en import English
     "sentence",
     [
         'The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.',
-        'The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale\'s #1.'
+        'The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale\'s #1.',
+        'The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale\'s number one',
+        'Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce him to his proper proportions.',
+        "It was a missed assignment, but it shouldn't have resulted in a turnover ..."
     ],
 )
 def test_issue3869(sentence):
     """Test that the Doc's count_by function works consistently"""
     nlp = English()
 
+    print()
     doc = nlp(sentence)
 
     count = 0
diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd
index 7cdc2316a..cc05cb495 100644
--- a/spacy/tokens/doc.pxd
+++ b/spacy/tokens/doc.pxd
@@ -8,6 +8,7 @@ from ..typedefs cimport attr_t
 from ..attrs cimport attr_id_t
 
 
+
 cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil
 
 
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index c77e5c44e..657b9a1d6 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -13,6 +13,7 @@ from libc.math cimport sqrt
 import numpy
 import numpy.linalg
 import struct
+from libc.stdint cimport int64_t
 import srsly
 from thinc.neural.util import get_array_module, copy_array
 
@@ -710,22 +711,52 @@ cdef class Doc:
         cdef int i
         cdef attr_t attr
         cdef size_t count
+        cdef int64_t this_value
+
+        print("COUNTING")
 
         if counts is None:
             counts = PreshCounter()
             output_dict = True
+            print("counts None")
         else:
             output_dict = False
         # Take this check out of the loop, for a bit of extra speed
         if exclude is None:
+            print("exclude None")
             for i in range(self.length):
-                counts.inc(get_token_attr(&self.c[i], attr_id), 1)
+                print()
+                print("token", self[i])
+                this_value = get_token_attr(&self.c[i], attr_id)
+                print("token attr value", this_value)
+                print("type attr value", type(this_value))
+
+                print(i, "key this_value before", counts.c_map.cells[this_value].key)
+                print(i, "value this_value before", <int64_t>counts.c_map.cells[this_value].value)
+                counts.inc(this_value, 1)
+                print(i, "key this_value after", counts.c_map.cells[this_value].key)
+                print(i, "value this_value after", <int64_t>counts.c_map.cells[this_value].value)
+
+                print(i, "key 0", counts.c_map.cells[0].key)
+                print(i, "value 0", <int64_t>counts.c_map.cells[0].value)
+                print(i, "key 1", counts.c_map.cells[1].key)
+                print(i, "value 1", <int64_t>counts.c_map.cells[1].value)
         else:
             for i in range(self.length):
                 if not exclude(self[i]):
                     attr = get_token_attr(&self.c[i], attr_id)
                     counts.inc(attr, 1)
         if output_dict:
+            print("output_dict")
+            print(counts.length)
+            print(counts.total)
+            print("key 0", counts.c_map.cells[0].key)
+            print("value 0", <int64_t>counts.c_map.cells[0].value)
+            print("key 1", counts.c_map.cells[1].key)
+            print("value 1", <int64_t>counts.c_map.cells[1].value)
+            print()
+            print(dict(counts))
+            print()
             return dict(counts)
 
     def _realloc(self, new_size):

From 0f0f07318a9bbf37ca3f4e008c35a7c88ded777f Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Thu, 11 Jul 2019 13:05:53 +0200
Subject: [PATCH 3/4] counter instead of preshcounter

---
 bin/train_word_vectors.py |  1 -
 spacy/tokens/doc.pxd      |  1 -
 spacy/tokens/doc.pyx      | 37 +++++--------------------------------
 3 files changed, 5 insertions(+), 34 deletions(-)

diff --git a/bin/train_word_vectors.py b/bin/train_word_vectors.py
index 624e339a0..663ce060d 100644
--- a/bin/train_word_vectors.py
+++ b/bin/train_word_vectors.py
@@ -5,7 +5,6 @@ import logging
 from pathlib import Path
 from collections import defaultdict
 from gensim.models import Word2Vec
-from preshed.counter import PreshCounter
 import plac
 import spacy
 
diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd
index cc05cb495..4b8578fe0 100644
--- a/spacy/tokens/doc.pxd
+++ b/spacy/tokens/doc.pxd
@@ -1,6 +1,5 @@
 from cymem.cymem cimport Pool
 cimport numpy as np
-from preshed.counter cimport PreshCounter
 
 from ..vocab cimport Vocab
 from ..structs cimport TokenC, LexemeC
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 657b9a1d6..3b0c2425c 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -9,6 +9,7 @@ cimport cython
 cimport numpy as np
 from libc.string cimport memcpy, memset
 from libc.math cimport sqrt
+from collections import Counter
 
 import numpy
 import numpy.linalg
@@ -698,7 +699,7 @@ cdef class Doc:
         # Handle 1d case
         return output if len(attr_ids) >= 2 else output.reshape((self.length,))
 
-    def count_by(self, attr_id_t attr_id, exclude=None, PreshCounter counts=None):
+    def count_by(self, attr_id_t attr_id, exclude=None, object counts=None):
         """Count the frequencies of a given attribute. Produces a dict of
         `{attribute (int): count (ints)}` frequencies, keyed by the values of
         the given attribute ID.
@@ -713,50 +714,22 @@ cdef class Doc:
         cdef size_t count
         cdef int64_t this_value
 
-        print("COUNTING")
-
         if counts is None:
-            counts = PreshCounter()
+            counts = Counter()
             output_dict = True
-            print("counts None")
         else:
             output_dict = False
         # Take this check out of the loop, for a bit of extra speed
         if exclude is None:
-            print("exclude None")
             for i in range(self.length):
-                print()
-                print("token", self[i])
                 this_value = get_token_attr(&self.c[i], attr_id)
-                print("token attr value", this_value)
-                print("type attr value", type(this_value))
-
-                print(i, "key this_value before", counts.c_map.cells[this_value].key)
-                print(i, "value this_value before", <int64_t>counts.c_map.cells[this_value].value)
-                counts.inc(this_value, 1)
-                print(i, "key this_value after", counts.c_map.cells[this_value].key)
-                print(i, "value this_value after", <int64_t>counts.c_map.cells[this_value].value)
-
-                print(i, "key 0", counts.c_map.cells[0].key)
-                print(i, "value 0", <int64_t>counts.c_map.cells[0].value)
-                print(i, "key 1", counts.c_map.cells[1].key)
-                print(i, "value 1", <int64_t>counts.c_map.cells[1].value)
+                counts[this_value] += 1
         else:
             for i in range(self.length):
                 if not exclude(self[i]):
                     attr = get_token_attr(&self.c[i], attr_id)
-                    counts.inc(attr, 1)
+                    counts[attr] += 1
         if output_dict:
-            print("output_dict")
-            print(counts.length)
-            print(counts.total)
-            print("key 0", counts.c_map.cells[0].key)
-            print("value 0", <int64_t>counts.c_map.cells[0].value)
-            print("key 1", counts.c_map.cells[1].key)
-            print("value 1", <int64_t>counts.c_map.cells[1].value)
-            print()
-            print(dict(counts))
-            print()
             return dict(counts)
 
     def _realloc(self, new_size):

From 349107daa3b0804c62861dbaa810e9a1488960b1 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Thu, 11 Jul 2019 13:09:22 +0200
Subject: [PATCH 4/4] cleanup

---
 spacy/tests/regression/test_issue3869.py | 2 --
 spacy/tokens/doc.pxd                     | 1 -
 spacy/tokens/doc.pyx                     | 8 ++------
 3 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/spacy/tests/regression/test_issue3869.py b/spacy/tests/regression/test_issue3869.py
index d76da6989..42584b133 100644
--- a/spacy/tests/regression/test_issue3869.py
+++ b/spacy/tests/regression/test_issue3869.py
@@ -20,8 +20,6 @@ from spacy.lang.en import English
 def test_issue3869(sentence):
     """Test that the Doc's count_by function works consistently"""
     nlp = English()
-
-    print()
     doc = nlp(sentence)
 
     count = 0
diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd
index 4b8578fe0..62665fcc5 100644
--- a/spacy/tokens/doc.pxd
+++ b/spacy/tokens/doc.pxd
@@ -7,7 +7,6 @@ from ..typedefs cimport attr_t
 from ..attrs cimport attr_id_t
 
 
-
 cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil
 
 
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 3b0c2425c..c1883f9c0 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -14,7 +14,6 @@ from collections import Counter
 import numpy
 import numpy.linalg
 import struct
-from libc.stdint cimport int64_t
 import srsly
 from thinc.neural.util import get_array_module, copy_array
 
@@ -712,7 +711,6 @@ cdef class Doc:
         cdef int i
         cdef attr_t attr
         cdef size_t count
-        cdef int64_t this_value
 
         if counts is None:
             counts = Counter()
@@ -722,13 +720,11 @@ cdef class Doc:
         # Take this check out of the loop, for a bit of extra speed
         if exclude is None:
             for i in range(self.length):
-                this_value = get_token_attr(&self.c[i], attr_id)
-                counts[this_value] += 1
+                counts[get_token_attr(&self.c[i], attr_id)] += 1
         else:
             for i in range(self.length):
                 if not exclude(self[i]):
-                    attr = get_token_attr(&self.c[i], attr_id)
-                    counts[attr] += 1
+                    counts[get_token_attr(&self.c[i], attr_id)] += 1
         if output_dict:
             return dict(counts)