From a89fecce97c06d7315bb955de1127025fa310b4b Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 11 Jul 2019 00:43:55 +0200 Subject: [PATCH 1/4] failing unit test for issue #3869 --- spacy/tests/regression/test_issue3869.py | 29 ++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 spacy/tests/regression/test_issue3869.py diff --git a/spacy/tests/regression/test_issue3869.py b/spacy/tests/regression/test_issue3869.py new file mode 100644 index 000000000..72a485042 --- /dev/null +++ b/spacy/tests/regression/test_issue3869.py @@ -0,0 +1,29 @@ +# coding: utf8 +from __future__ import unicode_literals + +import pytest + +from spacy.attrs import IS_ALPHA +from spacy.lang.en import English + + +@pytest.mark.parametrize( + "sentence", + [ + 'The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.', + 'The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale\'s #1.' + ], +) +def test_issue3869(sentence): + """Test that the Doc's count_by function works consistently""" + nlp = English() + + doc = nlp(sentence) + + count = 0 + for token in doc: + count += token.is_alpha + + assert count == doc.count_by(IS_ALPHA).get(1, 0) + + From e0804123854b91bbad5a3e084de867d5fbbff788 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 11 Jul 2019 01:53:06 +0200 Subject: [PATCH 2/4] tracked the bug down to PreshCounter.inc - still unclear what goes wrong --- spacy/tests/regression/test_issue3869.py | 6 ++++- spacy/tokens/doc.pxd | 1 + spacy/tokens/doc.pyx | 33 +++++++++++++++++++++++- 3 files changed, 38 insertions(+), 2 deletions(-) diff --git a/spacy/tests/regression/test_issue3869.py b/spacy/tests/regression/test_issue3869.py index 72a485042..d76da6989 100644 --- a/spacy/tests/regression/test_issue3869.py +++ b/spacy/tests/regression/test_issue3869.py @@ -11,13 +11,17 @@ from spacy.lang.en import English "sentence", [ 'The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.', - 'The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale\'s #1.' + 'The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale\'s #1.', + 'The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale\'s number one', + 'Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce him to his proper proportions.', + "It was a missed assignment, but it shouldn't have resulted in a turnover ..." ], ) def test_issue3869(sentence): """Test that the Doc's count_by function works consistently""" nlp = English() + print() doc = nlp(sentence) count = 0 diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd index 7cdc2316a..cc05cb495 100644 --- a/spacy/tokens/doc.pxd +++ b/spacy/tokens/doc.pxd @@ -8,6 +8,7 @@ from ..typedefs cimport attr_t from ..attrs cimport attr_id_t + cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index c77e5c44e..657b9a1d6 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -13,6 +13,7 @@ from libc.math cimport sqrt import numpy import numpy.linalg import struct +from libc.stdint cimport int64_t import srsly from thinc.neural.util import get_array_module, copy_array @@ -710,22 +711,52 @@ cdef class Doc: cdef int i cdef attr_t attr cdef size_t count + cdef int64_t this_value + + print("COUNTING") if counts is None: counts = PreshCounter() output_dict = True + print("counts None") else: output_dict = False # Take this check out of the loop, for a bit of extra speed if exclude is None: + print("exclude None") for i in range(self.length): - counts.inc(get_token_attr(&self.c[i], attr_id), 1) + print() + print("token", self[i]) + this_value = get_token_attr(&self.c[i], attr_id) + print("token attr value", this_value) + print("type attr value", type(this_value)) + + print(i, "key this_value before", counts.c_map.cells[this_value].key) + print(i, "value this_value before", counts.c_map.cells[this_value].value) + counts.inc(this_value, 1) + print(i, "key this_value after", counts.c_map.cells[this_value].key) + print(i, "value this_value after", counts.c_map.cells[this_value].value) + + print(i, "key 0", counts.c_map.cells[0].key) + print(i, "value 0", counts.c_map.cells[0].value) + print(i, "key 1", counts.c_map.cells[1].key) + print(i, "value 1", counts.c_map.cells[1].value) else: for i in range(self.length): if not exclude(self[i]): attr = get_token_attr(&self.c[i], attr_id) counts.inc(attr, 1) if output_dict: + print("output_dict") + print(counts.length) + print(counts.total) + print("key 0", counts.c_map.cells[0].key) + print("value 0", counts.c_map.cells[0].value) + print("key 1", counts.c_map.cells[1].key) + print("value 1", counts.c_map.cells[1].value) + print() + print(dict(counts)) + print() return dict(counts) def _realloc(self, new_size): From 0f0f07318a9bbf37ca3f4e008c35a7c88ded777f Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 11 Jul 2019 13:05:53 +0200 Subject: [PATCH 3/4] counter instead of preshcounter --- bin/train_word_vectors.py | 1 - spacy/tokens/doc.pxd | 1 - spacy/tokens/doc.pyx | 37 +++++-------------------------------- 3 files changed, 5 insertions(+), 34 deletions(-) diff --git a/bin/train_word_vectors.py b/bin/train_word_vectors.py index 624e339a0..663ce060d 100644 --- a/bin/train_word_vectors.py +++ b/bin/train_word_vectors.py @@ -5,7 +5,6 @@ import logging from pathlib import Path from collections import defaultdict from gensim.models import Word2Vec -from preshed.counter import PreshCounter import plac import spacy diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd index cc05cb495..4b8578fe0 100644 --- a/spacy/tokens/doc.pxd +++ b/spacy/tokens/doc.pxd @@ -1,6 +1,5 @@ from cymem.cymem cimport Pool cimport numpy as np -from preshed.counter cimport PreshCounter from ..vocab cimport Vocab from ..structs cimport TokenC, LexemeC diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 657b9a1d6..3b0c2425c 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -9,6 +9,7 @@ cimport cython cimport numpy as np from libc.string cimport memcpy, memset from libc.math cimport sqrt +from collections import Counter import numpy import numpy.linalg @@ -698,7 +699,7 @@ cdef class Doc: # Handle 1d case return output if len(attr_ids) >= 2 else output.reshape((self.length,)) - def count_by(self, attr_id_t attr_id, exclude=None, PreshCounter counts=None): + def count_by(self, attr_id_t attr_id, exclude=None, object counts=None): """Count the frequencies of a given attribute. Produces a dict of `{attribute (int): count (ints)}` frequencies, keyed by the values of the given attribute ID. @@ -713,50 +714,22 @@ cdef class Doc: cdef size_t count cdef int64_t this_value - print("COUNTING") - if counts is None: - counts = PreshCounter() + counts = Counter() output_dict = True - print("counts None") else: output_dict = False # Take this check out of the loop, for a bit of extra speed if exclude is None: - print("exclude None") for i in range(self.length): - print() - print("token", self[i]) this_value = get_token_attr(&self.c[i], attr_id) - print("token attr value", this_value) - print("type attr value", type(this_value)) - - print(i, "key this_value before", counts.c_map.cells[this_value].key) - print(i, "value this_value before", counts.c_map.cells[this_value].value) - counts.inc(this_value, 1) - print(i, "key this_value after", counts.c_map.cells[this_value].key) - print(i, "value this_value after", counts.c_map.cells[this_value].value) - - print(i, "key 0", counts.c_map.cells[0].key) - print(i, "value 0", counts.c_map.cells[0].value) - print(i, "key 1", counts.c_map.cells[1].key) - print(i, "value 1", counts.c_map.cells[1].value) + counts[this_value] += 1 else: for i in range(self.length): if not exclude(self[i]): attr = get_token_attr(&self.c[i], attr_id) - counts.inc(attr, 1) + counts[attr] += 1 if output_dict: - print("output_dict") - print(counts.length) - print(counts.total) - print("key 0", counts.c_map.cells[0].key) - print("value 0", counts.c_map.cells[0].value) - print("key 1", counts.c_map.cells[1].key) - print("value 1", counts.c_map.cells[1].value) - print() - print(dict(counts)) - print() return dict(counts) def _realloc(self, new_size): From 349107daa3b0804c62861dbaa810e9a1488960b1 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 11 Jul 2019 13:09:22 +0200 Subject: [PATCH 4/4] cleanup --- spacy/tests/regression/test_issue3869.py | 2 -- spacy/tokens/doc.pxd | 1 - spacy/tokens/doc.pyx | 8 ++------ 3 files changed, 2 insertions(+), 9 deletions(-) diff --git a/spacy/tests/regression/test_issue3869.py b/spacy/tests/regression/test_issue3869.py index d76da6989..42584b133 100644 --- a/spacy/tests/regression/test_issue3869.py +++ b/spacy/tests/regression/test_issue3869.py @@ -20,8 +20,6 @@ from spacy.lang.en import English def test_issue3869(sentence): """Test that the Doc's count_by function works consistently""" nlp = English() - - print() doc = nlp(sentence) count = 0 diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd index 4b8578fe0..62665fcc5 100644 --- a/spacy/tokens/doc.pxd +++ b/spacy/tokens/doc.pxd @@ -7,7 +7,6 @@ from ..typedefs cimport attr_t from ..attrs cimport attr_id_t - cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 3b0c2425c..c1883f9c0 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -14,7 +14,6 @@ from collections import Counter import numpy import numpy.linalg import struct -from libc.stdint cimport int64_t import srsly from thinc.neural.util import get_array_module, copy_array @@ -712,7 +711,6 @@ cdef class Doc: cdef int i cdef attr_t attr cdef size_t count - cdef int64_t this_value if counts is None: counts = Counter() @@ -722,13 +720,11 @@ cdef class Doc: # Take this check out of the loop, for a bit of extra speed if exclude is None: for i in range(self.length): - this_value = get_token_attr(&self.c[i], attr_id) - counts[this_value] += 1 + counts[get_token_attr(&self.c[i], attr_id)] += 1 else: for i in range(self.length): if not exclude(self[i]): - attr = get_token_attr(&self.c[i], attr_id) - counts[attr] += 1 + counts[get_token_attr(&self.c[i], attr_id)] += 1 if output_dict: return dict(counts)