diff --git a/bin/train_word_vectors.py b/bin/train_word_vectors.py index 624e339a0..663ce060d 100644 --- a/bin/train_word_vectors.py +++ b/bin/train_word_vectors.py @@ -5,7 +5,6 @@ import logging from pathlib import Path from collections import defaultdict from gensim.models import Word2Vec -from preshed.counter import PreshCounter import plac import spacy diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd index cc05cb495..4b8578fe0 100644 --- a/spacy/tokens/doc.pxd +++ b/spacy/tokens/doc.pxd @@ -1,6 +1,5 @@ from cymem.cymem cimport Pool cimport numpy as np -from preshed.counter cimport PreshCounter from ..vocab cimport Vocab from ..structs cimport TokenC, LexemeC diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 657b9a1d6..3b0c2425c 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -9,6 +9,7 @@ cimport cython cimport numpy as np from libc.string cimport memcpy, memset from libc.math cimport sqrt +from collections import Counter import numpy import numpy.linalg @@ -698,7 +699,7 @@ cdef class Doc: # Handle 1d case return output if len(attr_ids) >= 2 else output.reshape((self.length,)) - def count_by(self, attr_id_t attr_id, exclude=None, PreshCounter counts=None): + def count_by(self, attr_id_t attr_id, exclude=None, object counts=None): """Count the frequencies of a given attribute. Produces a dict of `{attribute (int): count (ints)}` frequencies, keyed by the values of the given attribute ID. @@ -713,50 +714,22 @@ cdef class Doc: cdef size_t count cdef int64_t this_value - print("COUNTING") - if counts is None: - counts = PreshCounter() + counts = Counter() output_dict = True - print("counts None") else: output_dict = False # Take this check out of the loop, for a bit of extra speed if exclude is None: - print("exclude None") for i in range(self.length): - print() - print("token", self[i]) this_value = get_token_attr(&self.c[i], attr_id) - print("token attr value", this_value) - print("type attr value", type(this_value)) - - print(i, "key this_value before", counts.c_map.cells[this_value].key) - print(i, "value this_value before", counts.c_map.cells[this_value].value) - counts.inc(this_value, 1) - print(i, "key this_value after", counts.c_map.cells[this_value].key) - print(i, "value this_value after", counts.c_map.cells[this_value].value) - - print(i, "key 0", counts.c_map.cells[0].key) - print(i, "value 0", counts.c_map.cells[0].value) - print(i, "key 1", counts.c_map.cells[1].key) - print(i, "value 1", counts.c_map.cells[1].value) + counts[this_value] += 1 else: for i in range(self.length): if not exclude(self[i]): attr = get_token_attr(&self.c[i], attr_id) - counts.inc(attr, 1) + counts[attr] += 1 if output_dict: - print("output_dict") - print(counts.length) - print(counts.total) - print("key 0", counts.c_map.cells[0].key) - print("value 0", counts.c_map.cells[0].value) - print("key 1", counts.c_map.cells[1].key) - print("value 1", counts.c_map.cells[1].value) - print() - print(dict(counts)) - print() return dict(counts) def _realloc(self, new_size):