Mirror of https://github.com/explosion/spaCy.git, synced 2025-01-26 09:14:32 +03:00
reduce memory load when reading all vectors from file (#6945)
* reduce memory load when reading all vectors from file
* one more small typo fix
This commit is contained in:
parent a323ef90df
commit 6ed423c16c
@@ -451,7 +451,7 @@ cdef class Lexeme:
             Lexeme.c_set_flag(self.c, IS_QUOTE, x)
 
     property is_left_punct:
-        """RETURNS (bool): Whether the lexeme is left punctuation, e.g. )."""
+        """RETURNS (bool): Whether the lexeme is left punctuation, e.g. (."""
         def __get__(self):
             return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT)
 
@@ -215,8 +215,7 @@ def convert_vectors(
 
 
 def read_vectors(vectors_loc: Path, truncate_vectors: int):
-    f = open_file(vectors_loc)
-    f = ensure_shape(f)
+    f = ensure_shape(vectors_loc)
     shape = tuple(int(size) for size in next(f).split())
     if truncate_vectors >= 1:
         shape = (truncate_vectors, shape[1])
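For context, a minimal, self-contained sketch (not spaCy's exact code) of how a consumer like read_vectors can use a generator that yields the shape line first and then one whitespace-separated vector per line. The fake_vector_lines helper and the numpy-backed storage are illustrative assumptions:

```python
import numpy

def fake_vector_lines():
    # Toy stand-in for ensure_shape(vectors_loc): "rows cols", then data rows.
    yield "2 3"
    yield "cat 0.1 0.2 0.3"
    yield "dog 0.4 0.5 0.6"

f = fake_vector_lines()
# The first item is the shape header, exactly as read_vectors expects.
shape = tuple(int(size) for size in next(f).split())
data = numpy.zeros(shape=shape, dtype="f")
keys = []
for i, line in enumerate(f):
    pieces = line.rstrip().split(" ")
    keys.append(pieces.pop(0))              # the word is the first field
    data[i] = numpy.asarray(pieces, dtype="f")
print(keys, data.shape)  # ['cat', 'dog'] (2, 3)
```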
|
@@ -251,11 +250,12 @@ def open_file(loc: Union[str, Path]) -> IO:
         return loc.open("r", encoding="utf8")
 
 
-def ensure_shape(lines):
+def ensure_shape(vectors_loc):
     """Ensure that the first line of the data is the vectors shape.
     If it's not, we read in the data and output the shape as the first result,
     so that the reader doesn't have to deal with the problem.
     """
+    lines = open_file(vectors_loc)
     first_line = next(lines)
     try:
         shape = tuple(int(size) for size in first_line.split())
@@ -269,7 +269,11 @@ def ensure_shape(lines):
         # Figure out the shape, make it the first value, and then give the
         # rest of the data.
         width = len(first_line.split()) - 1
-        captured = [first_line] + list(lines)
-        length = len(captured)
+        length = 1
+        for _ in lines:
+            length += 1
         yield f"{length} {width}"
-        yield from captured
+        # Reading the lines in again from file. This to avoid having to
+        # store all the results in a list in memory
+        lines2 = open_file(vectors_loc)
+        yield from lines2
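This hunk is the heart of the commit: instead of materialising every line in a Python list just to count the rows, ensure_shape now makes one pass to count them, then reopens the file and streams it. A simplified, self-contained sketch of that pattern (an assumed simplification; plain open() stands in for spaCy's open_file, which also handles gzip and tar archives):

```python
from pathlib import Path
from typing import Iterator, Union

def ensure_shape_sketch(vectors_loc: Union[str, Path]) -> Iterator[str]:
    """Yield a "rows cols" header followed by the raw vector lines, adding
    the header when the file does not already start with one."""
    lines = open(vectors_loc, encoding="utf8")
    first_line = next(lines)
    try:
        shape = tuple(int(size) for size in first_line.split())
    except ValueError:
        shape = None
    if shape is not None:
        # The file already has a shape header: just pass everything through.
        yield first_line
        yield from lines
        return
    # No header: make one pass just to count the rows (an int in memory,
    # not a list of every line), ...
    width = len(first_line.split()) - 1
    length = 1
    for _ in lines:
        length += 1
    yield f"{length} {width}"
    # ... then reopen the file and stream the data a second time.
    yield from open(vectors_loc, encoding="utf8")
```

The trade-off is that a header-less file is read from disk twice, but peak memory no longer scales with the number of vectors.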
@@ -727,7 +727,7 @@ capitalization by including a mix of capitalized and lowercase examples. See the
 
 Create a data augmentation callback that uses orth-variant replacement. The
 callback can be added to a corpus or other data iterator during training. It's
-is especially useful for punctuation and case replacement, to help generalize
+especially useful for punctuation and case replacement, to help generalize
 beyond corpora that don't have smart quotes, or only have smart quotes etc.
 
 | Name | Description |
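For context, a hypothetical sketch of how such an augmenter callback plugs into a Corpus during training. The toy lower-casing variant, the corpus path, and the probability are illustrative stand-ins, not the documented spacy.orth_variants.v1 factory:

```python
import random
from spacy.training import Corpus, Example

def toy_augmenter(nlp, example: Example):
    # Always keep the original example ...
    yield example
    if random.random() < 0.5:
        # ... and sometimes add a lower-cased variant with the same annotations.
        example_dict = example.to_dict()
        doc = nlp.make_doc(example.text.lower())
        example_dict["token_annotation"]["ORTH"] = [t.lower_ for t in example.reference]
        yield example.from_dict(doc, example_dict)

# The callback is passed to the corpus, which applies it while iterating
# over training examples (the path is an assumption).
train_corpus = Corpus("corpus/train.spacy", augmenter=toy_augmenter)
```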