Use collections.Counter instead of PreshCounter in Doc.count_by (and remove debug prints)

This commit is contained in:
svlandeg 2019-07-11 13:05:53 +02:00
parent e080412385
commit 0f0f07318a
3 changed files with 5 additions and 34 deletions

View File

@ -5,7 +5,6 @@ import logging
from pathlib import Path from pathlib import Path
from collections import defaultdict from collections import defaultdict
from gensim.models import Word2Vec from gensim.models import Word2Vec
from preshed.counter import PreshCounter
import plac import plac
import spacy import spacy

View File

@ -1,6 +1,5 @@
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
cimport numpy as np cimport numpy as np
from preshed.counter cimport PreshCounter
from ..vocab cimport Vocab from ..vocab cimport Vocab
from ..structs cimport TokenC, LexemeC from ..structs cimport TokenC, LexemeC

View File

@ -9,6 +9,7 @@ cimport cython
cimport numpy as np cimport numpy as np
from libc.string cimport memcpy, memset from libc.string cimport memcpy, memset
from libc.math cimport sqrt from libc.math cimport sqrt
from collections import Counter
import numpy import numpy
import numpy.linalg import numpy.linalg
@ -698,7 +699,7 @@ cdef class Doc:
# Handle 1d case # Handle 1d case
return output if len(attr_ids) >= 2 else output.reshape((self.length,)) return output if len(attr_ids) >= 2 else output.reshape((self.length,))
def count_by(self, attr_id_t attr_id, exclude=None, PreshCounter counts=None): def count_by(self, attr_id_t attr_id, exclude=None, object counts=None):
"""Count the frequencies of a given attribute. Produces a dict of """Count the frequencies of a given attribute. Produces a dict of
`{attribute (int): count (ints)}` frequencies, keyed by the values of `{attribute (int): count (ints)}` frequencies, keyed by the values of
the given attribute ID. the given attribute ID.
@ -713,50 +714,22 @@ cdef class Doc:
cdef size_t count cdef size_t count
cdef int64_t this_value cdef int64_t this_value
print("COUNTING")
if counts is None: if counts is None:
counts = PreshCounter() counts = Counter()
output_dict = True output_dict = True
print("counts None")
else: else:
output_dict = False output_dict = False
# Take this check out of the loop, for a bit of extra speed # Take this check out of the loop, for a bit of extra speed
if exclude is None: if exclude is None:
print("exclude None")
for i in range(self.length): for i in range(self.length):
print()
print("token", self[i])
this_value = get_token_attr(&self.c[i], attr_id) this_value = get_token_attr(&self.c[i], attr_id)
print("token attr value", this_value) counts[this_value] += 1
print("type attr value", type(this_value))
print(i, "key this_value before", counts.c_map.cells[this_value].key)
print(i, "value this_value before", <int64_t>counts.c_map.cells[this_value].value)
counts.inc(this_value, 1)
print(i, "key this_value after", counts.c_map.cells[this_value].key)
print(i, "value this_value after", <int64_t>counts.c_map.cells[this_value].value)
print(i, "key 0", counts.c_map.cells[0].key)
print(i, "value 0", <int64_t>counts.c_map.cells[0].value)
print(i, "key 1", counts.c_map.cells[1].key)
print(i, "value 1", <int64_t>counts.c_map.cells[1].value)
else: else:
for i in range(self.length): for i in range(self.length):
if not exclude(self[i]): if not exclude(self[i]):
attr = get_token_attr(&self.c[i], attr_id) attr = get_token_attr(&self.c[i], attr_id)
counts.inc(attr, 1) counts[attr] += 1
if output_dict: if output_dict:
print("output_dict")
print(counts.length)
print(counts.total)
print("key 0", counts.c_map.cells[0].key)
print("value 0", <int64_t>counts.c_map.cells[0].value)
print("key 1", counts.c_map.cells[1].key)
print("value 1", <int64_t>counts.c_map.cells[1].value)
print()
print(dict(counts))
print()
return dict(counts) return dict(counts)
def _realloc(self, new_size): def _realloc(self, new_size):