first commit

This commit is contained in:
Sergey Volnov 2023-03-26 14:38:57 +03:00
parent de9cfea142
commit 8ccbb4c062
5 changed files with 65869 additions and 0 deletions

266
inference_model.py Normal file
View File

@ -0,0 +1,266 @@
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import time
from datetime import datetime, timedelta
import catboost
import pickle
from sklearn.metrics import mean_absolute_error
from sklearn.base import BaseEstimator, TransformerMixin
import calendar
from datetime import date
import warnings
warnings.filterwarnings("ignore")
# Creating sales lag features
def create_sales_lag_feats(df, gpby_cols, target_col, lags):
gpby = df.groupby(gpby_cols)
for i in lags:
df['_'.join([target_col, 'lag', str(i)])] = \
gpby[target_col].shift(i).values + np.random.normal(scale=1, size=(len(df),)) * 0
return df
# Creating sales rolling mean features
def create_sales_rmean_feats(df, gpby_cols, target_col, windows, min_periods=2,
shift=1, win_type=None):
gpby = df.groupby(gpby_cols)
for w in windows:
df['_'.join([target_col, 'rmean', str(w)])] = \
gpby[target_col].shift(shift).rolling(window=w,
min_periods=min_periods,
win_type=win_type).mean().values +\
np.random.normal(scale=1, size=(len(df),)) * 0
return df
# Creating sales rolling median features
def create_sales_rmed_feats(df, gpby_cols, target_col, windows, min_periods=2,
shift=1, win_type=None):
gpby = df.groupby(gpby_cols)
for w in windows:
df['_'.join([target_col, 'rmed', str(w)])] = \
gpby[target_col].shift(shift).rolling(window=w,
min_periods=min_periods,
win_type=win_type).median().values +\
np.random.normal(scale=1, size=(len(df),)) * 0
return df
# Creating sales exponentially weighted mean features
def create_sales_ewm_feats(df, gpby_cols, target_col, alpha=[0.9], shift=[1]):
gpby = df.groupby(gpby_cols)
for a in alpha:
for s in shift:
df['_'.join([target_col, 'lag', str(s), 'ewm', str(a)])] = \
gpby[target_col].shift(s).ewm(alpha=a).mean().values
return df
def add_months(sourcedate, months):
month = sourcedate.month - 1 + months
year = sourcedate.year + month // 12
month = month % 12 + 1
day = min(sourcedate.day, calendar.monthrange(year,month)[1])
return date(year, month, day)
class Preprocesser(BaseEstimator, TransformerMixin):
def __init__(self, products_info_path, store_info_path):
self.hash_to_numbers = {}
self.numbers_to_hash = {}
self.drop_cols = ['date', 'sales', 'year', 'product_name', 'month']
self.products_info_path = products_info_path
self.store_info_path = store_info_path
self.cat_cols = ['store', 'item'] + \
['inn', 'product_short_name', 'tnved', 'tnved10', 'brand',
'country', 'region_code', 'city_with_type', 'city_fias_id', 'postal_code']
self.mean_values = {}
self.min_date = '2021-11-01'
def fit(self, data):
data = data.copy()
data = data.rename(columns={'dt': 'date', 'gtin': 'item', 'id_sp_': 'store', 'cnt': 'sales'})
data = data.drop(columns='inn', axis=1)
data = data.dropna(subset='store')
for col in ['item', 'store', 'prid']:
self.hash_to_numbers[col] = {a: b for a, b in zip(np.unique(data[col]), np.arange(data[col].nunique()))}
self.numbers_to_hash[col] = {b: a for a, b in zip(np.unique(data[col]), np.arange(data[col].nunique()))}
return self
def get_df(self, data):
data = data.sort_values(by='dt').reset_index(drop=True)
data = data.rename(columns={'dt': 'date', 'gtin': 'item', 'id_sp_': 'store', 'cnt': 'sales'})
data = data.drop(columns='inn', axis=1)
data = data.dropna(subset='store')
for col in ['item', 'store', 'prid']:
data.loc[:, col] = data[col].apply(lambda x: self.hash_to_numbers[col][x] if x in self.hash_to_numbers[col] else np.nan)
data.loc[:, col] = data[col].astype('int')
data['date'] = data['date'].apply(lambda x: x[:-2] + '01')
df = data.groupby(['store', 'item', 'date']).agg(sales = ('sales', 'sum'),
price = ('price', 'mean')).reset_index()
return df
def add_zero_points(self, df, pred_date=None):
if pred_date is None:
pred_date = df['date'].max()
i = 0
all_dates = []
while True:
cur_month = add_months(datetime.fromisoformat(self.min_date), i).isoformat()
all_dates.append(cur_month)
if cur_month == pred_date:
break
i += 1
x = df.groupby(['store', 'item'])['date'].unique()
add = []
for store_item, now_dates in tqdm(list(x.items())):
for d in all_dates:
if d not in now_dates:
add.append({'store': store_item[0],
'item': store_item[1],
'date': d,
'sales': 0})
df = pd.concat([df, pd.DataFrame(add)])
return df
def fill_price(self, prices):
L = np.ones(len(prices)) * -1
R = np.ones(len(prices)) * -1
for i in range(len(prices)):
if prices[i] == prices[i]: #not is nan
L[i] = prices[i]
elif i > 0:
L[i] = L[i - 1]
for i in range(len(prices) - 1, -1, -1):
if prices[i] == prices[i]: #not is nan
R[i] = prices[i]
elif i != len(prices) - 1:
R[i] = R[i + 1]
for i in range(len(prices)):
if prices[i] != prices[i]:
if L[i] == -1:
prices[i] = R[i]
elif R[i] == -1:
prices[i] = L[i]
else:
prices[i] = (L[i] + R[i]) / 2
return prices
def fix_prices(self, df):
groups = df.sort_values(by='date').groupby(['store', 'item'])['price']
res = []
for group in tqdm(groups):
res += self.fill_price(group[1].values).tolist()
df.sort_values(by=['store','item', 'date'], axis=0, inplace=True)
df['price'] = res
return df
def get_product_info(self, path):
products_info = pd.read_csv(path)
products_info['item'] = products_info['gtin'].apply(lambda x: self.hash_to_numbers['item'][x] if x in
self.hash_to_numbers['item'] else np.nan)
products_info = products_info.dropna(subset='item')
products_info = products_info.drop(columns='gtin')
products_info = products_info.drop_duplicates(subset='item', keep='last')
products_info['volume'] = products_info['volume'].replace('НЕ КЛАССИФИЦИРОВАНО', np.nan)
products_info['volume'] = products_info['volume'].apply(lambda x: float(x.replace(',', '.').replace(' г', ''))
if x == x else np.nan)
return products_info
def get_store_info(self, path):
store_info = pd.read_csv(path).drop(columns='inn')
store_info['store'] = store_info['id_sp_'].apply(lambda x: self.hash_to_numbers['store'][x] if x in
self.hash_to_numbers['store'] else np.nan)
store_info = store_info.dropna(subset='store')
store_info = store_info.drop(columns='id_sp_')
store_info = store_info.drop_duplicates(subset='store', keep='last')
return store_info
def build_features(self, df):
df['date'] = pd.to_datetime(df['date'])
df['month'] = df.date.dt.month
df['year'] = df.date.dt.year
df = df.merge(self.get_store_info(self.store_info_path), on='store', how='left')
df = df.merge(self.get_product_info(self.products_info_path), on='item', how='left')
df = create_sales_lag_feats(df, gpby_cols=['store','item'], target_col='sales',
lags=[1, 3, 6, 12])
df = create_sales_rmean_feats(df, gpby_cols=['store','item'],
target_col='sales', windows=[2, 3, 6, 12],
min_periods=2, win_type='triang')
df = create_sales_rmed_feats(df, gpby_cols=['store','item'],
target_col='sales', windows=[2, 3, 6, 12],
min_periods=2, win_type=None)
df = create_sales_ewm_feats(df, gpby_cols=['store','item'],
target_col='sales',
alpha=[0.95, 0.9, 0.8, 0.7, 0.6, 0.5],
shift=[1, 3, 6, 12])
df = create_sales_rmean_feats(df, gpby_cols=['store','item'],
target_col='price', windows=[2, 3, 6, 12],
min_periods=2,)
df[self.cat_cols] = df[self.cat_cols].fillna('unknown').astype('str')
return df
def transform(self, data, pred_date=None):
df = self.get_df(data)
df = self.add_zero_points(df, pred_date)
self.mean_values = df.groupby(['store', 'item'])['sales'].mean()
df = self.fix_prices(df)
df = self.build_features(df)
return df
def full_solver(model, preprocesser, data, pred_date='2022-12-01'):
cnt = preprocesser.get_df(data).groupby(['store', 'item'])['sales'].count()
df_test = preprocesser.transform(data, pred_date)
pred_mask = df_test['date'] == pred_date
X_test, y_test = df_test.drop(columns=preprocesser.drop_cols), df_test['sales']
preds = model.predict(X_test)
df_test.loc[:, 'store'] = df_test['store'].astype(int)
df_test.loc[:, 'item'] = df_test['item'].astype(int)
mask = df_test.apply(lambda x: cnt[(x['store'], x['item'])], axis=1) < 10
if mask.sum():
# print(df_test[mask].apply(lambda x: preprocesser.mean_values[(x['store'], x['item'])], axis=1))
preds[mask] = df_test[mask].apply(lambda x: preprocesser.mean_values[(x['store'], x['item'])], axis=1)
preds = np.around(preds)
print('mae:', mean_absolute_error(y_test[~pred_mask], preds[~pred_mask]))
# print('smape:', smape(preds[~pred_mask], y_test[~pred_mask]))
res = df_test[['store', 'item', 'date']]
res.loc[:, 'preds'] = preds
res.loc[:, 'store'] = res['store'].apply(lambda x: preprocesser.numbers_to_hash['store'][x])
res.loc[:, 'item'] = res['item'].apply(lambda x: preprocesser.numbers_to_hash['item'][x])
return res[pred_mask]
if __name__ == '__main__':
with open('model.pickle', 'rb') as f:
model = pickle.load(f)
with open('preprocesser.pickle', 'rb') as f:
preprocesser = pickle.load(f)
data = pd.read_csv('sample_input.csv')
res = full_solver(model, preprocesser, data, '2022-12-01')
print(res.head())

65596
main.ipynb Normal file

File diff suppressed because one or more lines are too long

BIN
model.pickle Normal file

Binary file not shown.

BIN
preprocesser.pickle Normal file

Binary file not shown.

7
requirements.txt Normal file
View File

@ -0,0 +1,7 @@
catboost==1.1.1
cloudpickle==2.2.1
numpy==1.23.5
pandas==1.5.2
pickleshare==0.7.5
scikit-learn==1.2.0
tqdm==4.64.1