from typing import Union, Dict, Optional, Any, IO, TYPE_CHECKING
from thinc.api import Config, fix_random_seed, set_gpu_allocator
from thinc.api import ConfigValidationError
from pathlib import Path
import srsly
import numpy
import tarfile
import gzip
import zipfile
import tqdm
from itertools import islice

from .pretrain import get_tok2vec_ref
from ..lookups import Lookups
from ..vectors import Vectors
from ..errors import Errors, Warnings
from ..schemas import ConfigSchemaTraining
from ..util import registry, load_model_from_config, resolve_dot_names, logger
from ..util import load_model, ensure_path, get_sourced_components
from ..util import OOV_RANK, DEFAULT_OOV_PROB

if TYPE_CHECKING:
    from ..language import Language  # noqa: F401


def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
    """Initialize an nlp object for training from a (possibly uninterpolated)
    config: validate the [training] block, seed and set up the GPU allocator,
    build the pipeline, resume sourced components and initialize the rest.
    """
    raw_config = config
    config = raw_config.interpolate()
    if "seed" not in config["training"]:
        raise ValueError(Errors.E1015.format(value="[training] seed"))
    if "gpu_allocator" not in config["training"]:
        raise ValueError(Errors.E1015.format(value="[training] gpu_allocator"))
    if config["training"]["seed"] is not None:
        fix_random_seed(config["training"]["seed"])
    allocator = config["training"]["gpu_allocator"]
    if use_gpu >= 0 and allocator:
        set_gpu_allocator(allocator)
    # Use original config here before it's resolved to functions
    sourced = get_sourced_components(config)
    nlp = load_model_from_config(raw_config, auto_fill=True)
    logger.info("Set up nlp object from config")
    config = nlp.config.interpolate()
    # Resolve all training-relevant sections using the filled nlp config
    T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
    dot_names = [T["train_corpus"], T["dev_corpus"]]
    if not isinstance(T["train_corpus"], str):
        raise ConfigValidationError(
            desc=Errors.E897.format(
                field="training.train_corpus", type=type(T["train_corpus"])
            )
        )
    if not isinstance(T["dev_corpus"], str):
        raise ConfigValidationError(
            desc=Errors.E897.format(
                field="training.dev_corpus", type=type(T["dev_corpus"])
            )
        )
    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
    optimizer = T["optimizer"]
    # Components that shouldn't be updated during training
    frozen_components = T["frozen_components"]
    # Sourced components that require resume_training
    resume_components = [p for p in sourced if p not in frozen_components]
    logger.info(f"Pipeline: {nlp.pipe_names}")
    if resume_components:
        with nlp.select_pipes(enable=resume_components):
            logger.info(f"Resuming training for: {resume_components}")
            nlp.resume_training(sgd=optimizer)
    # Make sure that listeners are defined before initializing further
    nlp._link_components()
    with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
        if T["max_epochs"] == -1:
            logger.debug(
                "Due to streamed train corpus, using only first 100 examples "
                "for initialization. If necessary, provide all labels in "
                "[initialize]. More info: https://spacy.io/api/cli#init_labels"
            )
            nlp.initialize(lambda: islice(train_corpus(nlp), 100), sgd=optimizer)
        else:
            nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
        logger.info(f"Initialized pipeline components: {nlp.pipe_names}")
    # Detect components with listeners that are not frozen consistently
    for name, proc in nlp.pipeline:
        # e.g. tok2vec/transformer
        for listener in getattr(proc, "listening_components", []):
            # Don't warn about components not in the pipeline
            if listener not in nlp.pipe_names:
                continue
            if listener in frozen_components and name not in frozen_components:
                logger.warning(Warnings.W087.format(name=name, listener=listener))
            # We always check this regardless, in case user freezes tok2vec
            if listener not in frozen_components and name in frozen_components:
                logger.warning(Warnings.W086.format(name=name, listener=listener))
    return nlp
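

# Usage sketch for init_nlp (the config path below is hypothetical; any
# valid training config, e.g. one generated by `spacy init config`, works
# the same way). Note that the raw, uninterpolated config is passed in;
# init_nlp interpolates it itself:
#
#     from spacy import util
#     config = util.load_config("config.cfg", interpolate=False)
#     nlp = init_nlp(config, use_gpu=-1)  # -1 = CPU
#     nlp.to_disk("training/model-initial")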


def init_vocab(
    nlp: "Language",
    *,
    data: Optional[Path] = None,
    lookups: Optional[Lookups] = None,
    vectors: Optional[str] = None,
) -> None:
    """Initialize the vocab: add lookup tables, load lexeme attributes from a
    JSONL file and load vectors from an installed package or path.
    """
    if lookups:
        nlp.vocab.lookups = lookups
        logger.info(f"Added vocab lookups: {', '.join(lookups.tables)}")
    data_path = ensure_path(data)
    if data_path is not None:
        lex_attrs = srsly.read_jsonl(data_path)
        for lexeme in nlp.vocab:
            lexeme.rank = OOV_RANK
        for attrs in lex_attrs:
            if "settings" in attrs:
                continue
            lexeme = nlp.vocab[attrs["orth"]]
            lexeme.set_attrs(**attrs)
        if len(nlp.vocab):
            oov_prob = min(lex.prob for lex in nlp.vocab) - 1
        else:
            oov_prob = DEFAULT_OOV_PROB
        nlp.vocab.cfg.update({"oov_prob": oov_prob})
        logger.info(f"Added {len(nlp.vocab)} lexical entries to the vocab")
    logger.info("Created vocabulary")
    if vectors is not None:
        load_vectors_into_model(nlp, vectors)
        logger.info(f"Added vectors: {vectors}")
    logger.info("Finished initializing nlp object")


def load_vectors_into_model(
    nlp: "Language", name: Union[str, Path], *, add_strings: bool = True
) -> None:
    """Load word vectors from an installed model or path into a model instance."""
    try:
        vectors_nlp = load_model(name)
    except ConfigValidationError as e:
        title = f"Config validation error for vectors {name}"
        desc = (
            "This typically means that there's a problem in the config.cfg included "
            "with the packaged vectors. Make sure that the vectors package you're "
            "loading is compatible with the current version of spaCy."
        )
        err = ConfigValidationError.from_error(e, title=title, desc=desc)
        raise err from None
    if len(vectors_nlp.vocab.vectors.keys()) == 0:
        logger.warning(Warnings.W112.format(name=name))
    nlp.vocab.vectors = vectors_nlp.vocab.vectors
    if add_strings:
        # Add the strings from the vectors_nlp model, so that e.g. a
        # similarity query can resolve each vector key to readable text.
        for key in nlp.vocab.vectors.key2row:
            if key in vectors_nlp.vocab.strings:
                nlp.vocab.strings.add(vectors_nlp.vocab.strings[key])
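

# Usage sketch (the package name is illustrative; any installed pipeline
# or path that ships vectors works the same way):
#
#     import spacy
#     nlp = spacy.blank("en")
#     load_vectors_into_model(nlp, "en_core_web_lg")
#     print(nlp.vocab.vectors.shape)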


def init_tok2vec(
    nlp: "Language", pretrain_config: Dict[str, Any], init_config: Dict[str, Any]
) -> bool:
    """Load pretrained tok2vec weights - cf. CLI command 'pretrain'. Returns
    True if weights were loaded, False otherwise.
    """
    P = pretrain_config
    I = init_config
    weights_data = None
    init_tok2vec = ensure_path(I["init_tok2vec"])
    if init_tok2vec is not None:
        if not init_tok2vec.exists():
            err = f"can't find pretrained tok2vec: {init_tok2vec}"
            errors = [{"loc": ["initialize", "init_tok2vec"], "msg": err}]
            raise ConfigValidationError(config=nlp.config, errors=errors)
        with init_tok2vec.open("rb") as file_:
            weights_data = file_.read()
    if weights_data is not None:
        layer = get_tok2vec_ref(nlp, P)
        layer.from_bytes(weights_data)
        logger.info(f"Loaded pretrained weights from {init_tok2vec}")
        return True
    return False
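

# The weights path comes from the [initialize] block of the config, e.g.
# (the path is hypothetical):
#
#     [initialize]
#     init_tok2vec = "pretrain/model99.bin"
#
# `pretrain_config` (the resolved [pretraining] block) tells get_tok2vec_ref
# which component's tok2vec layer the weights belong to.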


def convert_vectors(
    nlp: "Language",
    vectors_loc: Optional[Path],
    *,
    truncate: int,
    prune: int,
    name: Optional[str] = None,
) -> None:
    """Load vectors from a .npz file or a plain-text vectors file into the
    vocab, optionally truncating to the first `truncate` rows and pruning
    down to `prune` distinct rows.
    """
    vectors_loc = ensure_path(vectors_loc)
    if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
        nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
        for lex in nlp.vocab:
            if lex.rank and lex.rank != OOV_RANK:
                nlp.vocab.vectors.add(lex.orth, row=lex.rank)
    else:
        if vectors_loc:
            logger.info(f"Reading vectors from {vectors_loc}")
            vectors_data, vector_keys = read_vectors(vectors_loc, truncate)
            logger.info(f"Loaded vectors from {vectors_loc}")
        else:
            vectors_data, vector_keys = (None, None)
        if vector_keys is not None:
            for word in vector_keys:
                if word not in nlp.vocab:
                    nlp.vocab[word]  # adds the word to the vocab as a side effect
        if vectors_data is not None:
            nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
    if name is None:
        # TODO: Is this correct? Does this matter?
        nlp.vocab.vectors.name = f"{nlp.meta['lang']}_{nlp.meta['name']}.vectors"
    else:
        nlp.vocab.vectors.name = name
    nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
    if prune >= 1:
        nlp.vocab.prune_vectors(prune)
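

# Usage sketch for converting a plain-text vectors file (the path is
# hypothetical; open_file below also handles .gz, .zip and .tar.gz).
# truncate=0 keeps all rows and prune=-1 disables pruning:
#
#     import spacy
#     nlp = spacy.blank("en")
#     convert_vectors(nlp, Path("vec/cc.en.300.vec.gz"), truncate=0, prune=-1)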


def read_vectors(vectors_loc: Path, truncate_vectors: int):
    """Read a plain-text vectors file into a numpy array and a list of keys,
    keeping only the first `truncate_vectors` rows if that value is >= 1.
    """
    f = ensure_shape(vectors_loc)
    shape = tuple(int(size) for size in next(f).split())
    if truncate_vectors >= 1:
        shape = (truncate_vectors, shape[1])
    vectors_data = numpy.zeros(shape=shape, dtype="f")
    vectors_keys = []
    for i, line in enumerate(tqdm.tqdm(f)):
        line = line.rstrip()
        pieces = line.rsplit(" ", vectors_data.shape[1])
        word = pieces.pop(0)
        if len(pieces) != vectors_data.shape[1]:
            raise ValueError(Errors.E094.format(line_num=i, loc=vectors_loc))
        vectors_data[i] = numpy.asarray(pieces, dtype="f")
        vectors_keys.append(word)
        if i == truncate_vectors - 1:
            break
    return vectors_data, vectors_keys
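

# read_vectors expects word2vec-style text input: a header line with the
# number of rows and the vector width, then one whitespace-separated row
# per word, e.g.:
#
#     2 3
#     apple 0.1 0.2 0.3
#     pear 0.4 0.5 0.6
#
# ensure_shape (below) synthesizes the header line if the file lacks one.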


def open_file(loc: Union[str, Path]) -> IO:
    """Handle .gz, .tar.gz or unzipped files."""
    loc = ensure_path(loc)
    if tarfile.is_tarfile(str(loc)):
        return tarfile.open(str(loc), "r:gz")
    elif loc.parts[-1].endswith("gz"):
        return (line.decode("utf8") for line in gzip.open(str(loc), "r"))
    elif loc.parts[-1].endswith("zip"):
        zip_file = zipfile.ZipFile(str(loc))
        names = zip_file.namelist()
        file_ = zip_file.open(names[0])
        return (line.decode("utf8") for line in file_)
    else:
        return loc.open("r", encoding="utf8")


def ensure_shape(vectors_loc):
    """Ensure that the first line of the data is the vectors shape.
    If it's not, we read in the data and output the shape as the first result,
    so that the reader doesn't have to deal with the problem.
    """
    lines = open_file(vectors_loc)
    first_line = next(lines)
    try:
        shape = tuple(int(size) for size in first_line.split())
    except ValueError:
        shape = None
    if shape is not None:
        # All good, give the data
        yield first_line
        yield from lines
    else:
        # Figure out the shape, make it the first value, and then give the
        # rest of the data.
        width = len(first_line.split()) - 1
        length = 1
        for _ in lines:
            length += 1
        yield f"{length} {width}"
        # Read the lines in again from the file. This is to avoid having to
        # store all the results in a list in memory.
        lines2 = open_file(vectors_loc)
        yield from lines2
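

# For a headerless file, ensure_shape makes two passes over the data: one
# to count the rows (yielding e.g. "2 3" as a synthetic header for the
# example file above) and a second, re-opened via open_file, to stream the
# rows themselves without holding the whole file in memory.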