mirror of https://github.com/marking-hack/ML.git
synced 2024-11-24 18:23:46 +03:00

first commit

This commit is contained in:
parent de9cfea142
commit 8ccbb4c062

266  inference_model.py  Normal file
@@ -0,0 +1,266 @@
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import time
from datetime import datetime, timedelta, date
import catboost
import pickle
from sklearn.metrics import mean_absolute_error
from sklearn.base import BaseEstimator, TransformerMixin
import calendar
import warnings

warnings.filterwarnings("ignore")

# Creating sales lag features
def create_sales_lag_feats(df, gpby_cols, target_col, lags):
    gpby = df.groupby(gpby_cols)
    for i in lags:
        # Shift within each group so a row only ever sees that group's past.
        df['_'.join([target_col, 'lag', str(i)])] = \
            gpby[target_col].shift(i).values
    return df
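

# Illustrative sketch, not part of the original commit: a tiny check of the
# per-group lag semantics above; all values below are invented. This helper
# is defined for exposition only and is never called.
def _demo_lag_feats():
    toy = pd.DataFrame({'store': [0, 0, 0, 1, 1],
                        'item': [7, 7, 7, 7, 7],
                        'sales': [10, 20, 30, 5, 6]})
    toy = create_sales_lag_feats(toy, gpby_cols=['store', 'item'],
                                 target_col='sales', lags=[1])
    # Each group's first row has no history, so its lag is NaN:
    # toy['sales_lag_1'] -> [NaN, 10.0, 20.0, NaN, 5.0]
    print(toy)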


# Creating sales rolling mean features
def create_sales_rmean_feats(df, gpby_cols, target_col, windows, min_periods=2,
                             shift=1, win_type=None):
    gpby = df.groupby(gpby_cols)
    for w in windows:
        # Shift first so the current row's own value never enters its window.
        # Note: the shift is per group, but rolling() then runs over the whole
        # column, so windows can cross group boundaries.
        df['_'.join([target_col, 'rmean', str(w)])] = \
            gpby[target_col].shift(shift).rolling(window=w,
                                                  min_periods=min_periods,
                                                  win_type=win_type).mean().values
    return df
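

# Illustrative sketch, not part of the original commit: with shift=1 the
# window ends at the previous row, so a row's own sales never leak into its
# feature; values are invented. Defined for exposition only.
def _demo_rmean_feats():
    toy = pd.DataFrame({'store': [0] * 4, 'item': [7] * 4,
                        'sales': [10, 20, 30, 40]})
    toy = create_sales_rmean_feats(toy, gpby_cols=['store', 'item'],
                                   target_col='sales', windows=[2],
                                   min_periods=2)
    # toy['sales_rmean_2'] -> [NaN, NaN, 15.0, 25.0]
    print(toy)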


# Creating sales rolling median features
def create_sales_rmed_feats(df, gpby_cols, target_col, windows, min_periods=2,
                            shift=1, win_type=None):
    gpby = df.groupby(gpby_cols)
    for w in windows:
        # Same structure as the rolling mean above, with the same caveat that
        # the rolling window runs over the whole shifted column.
        df['_'.join([target_col, 'rmed', str(w)])] = \
            gpby[target_col].shift(shift).rolling(window=w,
                                                  min_periods=min_periods,
                                                  win_type=win_type).median().values
    return df


# Creating sales exponentially weighted mean features
def create_sales_ewm_feats(df, gpby_cols, target_col, alpha=[0.9], shift=[1]):
    gpby = df.groupby(gpby_cols)
    for a in alpha:
        for s in shift:
            # Note: the shift is per group, but ewm() then smooths the whole
            # column, so the smoothing crosses group boundaries.
            df['_'.join([target_col, 'lag', str(s), 'ewm', str(a)])] = \
                gpby[target_col].shift(s).ewm(alpha=a).mean().values
    return df
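

# Illustrative sketch, not part of the original commit: a per-group variant of
# the EWM feature via pandas' groupby-ewm, shown for contrast with the
# column-wide smoothing above; it is not what the shipped model was trained
# with. Defined for exposition only.
def _demo_ewm_per_group(df, alpha=0.9, shift=1):
    shifted = df.groupby(['store', 'item'])['sales'].shift(shift)
    # Smooth within each (store, item) group, then restore the original order.
    return (shifted.groupby([df['store'], df['item']])
                   .ewm(alpha=alpha).mean()
                   .reset_index(level=[0, 1], drop=True)
                   .sort_index())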


# Add `months` calendar months to a date, clamping the day to the last valid
# day of the target month.
def add_months(sourcedate, months):
    month = sourcedate.month - 1 + months
    year = sourcedate.year + month // 12
    month = month % 12 + 1
    day = min(sourcedate.day, calendar.monthrange(year, month)[1])
    return date(year, month, day)
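

# Illustrative sketch, not part of the original commit: add_months clamps the
# day to the end of the target month and rolls the year; the dates below are
# invented. Defined for exposition only.
def _demo_add_months():
    assert add_months(date(2022, 1, 31), 1) == date(2022, 2, 28)  # day clamped
    assert add_months(date(2022, 11, 1), 2) == date(2023, 1, 1)   # year rolls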


class Preprocesser(BaseEstimator, TransformerMixin):
    # Fits hash -> dense-id encodings on raw transactions and turns them into
    # a monthly feature matrix for the model.
    def __init__(self, products_info_path, store_info_path):
        self.hash_to_numbers = {}
        self.numbers_to_hash = {}
        # Columns excluded from the model's feature matrix.
        self.drop_cols = ['date', 'sales', 'year', 'product_name', 'month']
        self.products_info_path = products_info_path
        self.store_info_path = store_info_path
        self.cat_cols = ['store', 'item'] + \
            ['inn', 'product_short_name', 'tnved', 'tnved10', 'brand',
             'country', 'region_code', 'city_with_type', 'city_fias_id', 'postal_code']
        self.mean_values = {}
        self.min_date = '2021-11-01'

    # Learn the hash <-> dense-integer mappings for item, store and prid.
    def fit(self, data):
        data = data.copy()
        data = data.rename(columns={'dt': 'date', 'gtin': 'item', 'id_sp_': 'store', 'cnt': 'sales'})
        data = data.drop(columns='inn')
        data = data.dropna(subset='store')
        for col in ['item', 'store', 'prid']:
            self.hash_to_numbers[col] = {a: b for a, b in zip(np.unique(data[col]), np.arange(data[col].nunique()))}
            self.numbers_to_hash[col] = {b: a for a, b in zip(np.unique(data[col]), np.arange(data[col].nunique()))}
        return self
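
    # Illustrative sketch, not part of the original commit: what fit() builds.
    # The hash strings below are invented stand-ins for real gtin values.
    #
    #   prep = Preprocesser('products.csv', 'stores.csv')
    #   prep.fit(raw_df)
    #   prep.hash_to_numbers['item']   # {'a1f...': 0, 'b2e...': 1, ...}
    #   prep.numbers_to_hash['item']   # {0: 'a1f...', 1: 'b2e...', ...}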

    # Aggregate raw transactions into monthly (store, item) sales and mean price.
    def get_df(self, data):
        data = data.sort_values(by='dt').reset_index(drop=True)
        data = data.rename(columns={'dt': 'date', 'gtin': 'item', 'id_sp_': 'store', 'cnt': 'sales'})
        data = data.drop(columns='inn')
        data = data.dropna(subset='store')

        # Encode hashes with the mappings learned in fit(); the astype('int')
        # assumes every hash in the data was already seen during fit.
        for col in ['item', 'store', 'prid']:
            data.loc[:, col] = data[col].apply(lambda x: self.hash_to_numbers[col][x] if x in self.hash_to_numbers[col] else np.nan)
            data.loc[:, col] = data[col].astype('int')

        # Floor every 'YYYY-MM-DD' date to the first day of its month.
        data['date'] = data['date'].apply(lambda x: x[:-2] + '01')
        df = data.groupby(['store', 'item', 'date']).agg(sales=('sales', 'sum'),
                                                         price=('price', 'mean')).reset_index()
        return df
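
    # Illustrative sketch, not part of the original commit: the string slice
    # above floors a date to its month because the input is 'YYYY-MM-DD':
    #
    #   '2021-11-15'[:-2] + '01'  ->  '2021-11-01'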

    # Densify the panel: add explicit zero-sales rows for every month in
    # [min_date, pred_date] that a (store, item) pair is missing.
    def add_zero_points(self, df, pred_date=None):
        if pred_date is None:
            pred_date = df['date'].max()

        # Enumerate months from min_date up to and including pred_date
        # (pred_date must be the first of a month, or this loop never stops).
        i = 0
        all_dates = []
        while True:
            cur_month = add_months(datetime.fromisoformat(self.min_date), i).isoformat()
            all_dates.append(cur_month)
            if cur_month == pred_date:
                break
            i += 1

        x = df.groupby(['store', 'item'])['date'].unique()
        add = []
        for store_item, now_dates in tqdm(list(x.items())):
            for d in all_dates:
                if d not in now_dates:
                    add.append({'store': store_item[0],
                                'item': store_item[1],
                                'date': d,
                                'sales': 0})

        df = pd.concat([df, pd.DataFrame(add)])
        return df
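
    # Illustrative sketch, not part of the original commit: an equivalent
    # vectorized densification using a cross join, under the same assumption
    # the loop above enforces (every observed pair covers every month):
    #
    #   pairs = df[['store', 'item']].drop_duplicates()
    #   full = pairs.merge(pd.DataFrame({'date': all_dates}), how='cross')
    #   df = full.merge(df, on=['store', 'item', 'date'], how='left')
    #   df['sales'] = df['sales'].fillna(0)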

    # Fill NaN prices in a 1-D array: L carries the nearest value to the left,
    # R the nearest to the right; -1 marks "no value on that side".
    def fill_price(self, prices):
        L = np.ones(len(prices)) * -1
        R = np.ones(len(prices)) * -1
        for i in range(len(prices)):
            if prices[i] == prices[i]:  # x == x is False only for NaN
                L[i] = prices[i]
            elif i > 0:
                L[i] = L[i - 1]

        for i in range(len(prices) - 1, -1, -1):
            if prices[i] == prices[i]:  # not NaN
                R[i] = prices[i]
            elif i != len(prices) - 1:
                R[i] = R[i + 1]

        for i in range(len(prices)):
            if prices[i] != prices[i]:  # NaN: fill from the neighbours
                if L[i] == -1:
                    prices[i] = R[i]
                elif R[i] == -1:
                    prices[i] = L[i]
                else:
                    prices[i] = (L[i] + R[i]) / 2
        return prices
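
    # Illustrative sketch, not part of the original commit: fill_price is
    # equivalent to combining a forward fill and a backward fill, averaging
    # where both sides exist:
    #
    #   s = pd.Series(prices, dtype='float')
    #   f, b = s.ffill(), s.bfill()
    #   filled = f.where(b.isna(), b.where(f.isna(), (f + b) / 2))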

    # Apply fill_price per (store, item) group. groupby iterates groups in
    # sorted key order, matching the sort below, so the concatenated results
    # line up with the sorted frame.
    def fix_prices(self, df):
        groups = df.sort_values(by='date').groupby(['store', 'item'])['price']

        res = []
        for group in tqdm(groups):
            res += self.fill_price(group[1].values).tolist()
        df.sort_values(by=['store', 'item', 'date'], axis=0, inplace=True)
        df['price'] = res
        return df

    # Load product metadata and key it by the dense item id.
    def get_product_info(self, path):
        products_info = pd.read_csv(path)

        products_info['item'] = products_info['gtin'].apply(
            lambda x: self.hash_to_numbers['item'][x] if x in self.hash_to_numbers['item'] else np.nan)
        products_info = products_info.dropna(subset='item')
        products_info = products_info.drop(columns='gtin')
        products_info = products_info.drop_duplicates(subset='item', keep='last')

        # 'НЕ КЛАССИФИЦИРОВАНО' means "not classified"; the remaining volumes
        # use a comma decimal separator and a ' г' (grams) suffix.
        products_info['volume'] = products_info['volume'].replace('НЕ КЛАССИФИЦИРОВАНО', np.nan)
        products_info['volume'] = products_info['volume'].apply(
            lambda x: float(x.replace(',', '.').replace(' г', '')) if x == x else np.nan)
        return products_info

    # Load store metadata and key it by the dense store id.
    def get_store_info(self, path):
        store_info = pd.read_csv(path).drop(columns='inn')
        store_info['store'] = store_info['id_sp_'].apply(
            lambda x: self.hash_to_numbers['store'][x] if x in self.hash_to_numbers['store'] else np.nan)
        store_info = store_info.dropna(subset='store')
        store_info = store_info.drop(columns='id_sp_')
        store_info = store_info.drop_duplicates(subset='store', keep='last')
        return store_info

    # Join store/product metadata and build all time-series features.
    def build_features(self, df):
        df['date'] = pd.to_datetime(df['date'])
        df['month'] = df.date.dt.month
        df['year'] = df.date.dt.year

        df = df.merge(self.get_store_info(self.store_info_path), on='store', how='left')
        df = df.merge(self.get_product_info(self.products_info_path), on='item', how='left')

        df = create_sales_lag_feats(df, gpby_cols=['store', 'item'], target_col='sales',
                                    lags=[1, 3, 6, 12])

        df = create_sales_rmean_feats(df, gpby_cols=['store', 'item'],
                                      target_col='sales', windows=[2, 3, 6, 12],
                                      min_periods=2, win_type='triang')

        df = create_sales_rmed_feats(df, gpby_cols=['store', 'item'],
                                     target_col='sales', windows=[2, 3, 6, 12],
                                     min_periods=2, win_type=None)

        df = create_sales_ewm_feats(df, gpby_cols=['store', 'item'],
                                    target_col='sales',
                                    alpha=[0.95, 0.9, 0.8, 0.7, 0.6, 0.5],
                                    shift=[1, 3, 6, 12])

        df = create_sales_rmean_feats(df, gpby_cols=['store', 'item'],
                                      target_col='price', windows=[2, 3, 6, 12],
                                      min_periods=2)

        # CatBoost expects categorical features as strings without NaNs.
        df[self.cat_cols] = df[self.cat_cols].fillna('unknown').astype('str')
        return df
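
    # Illustrative sketch, not part of the original commit: the generated
    # feature names follow the '_'.join conventions of the helpers above, e.g.
    #
    #   sales_lag_1 ... sales_lag_12
    #   sales_rmean_2 ... sales_rmean_12   (and price_rmean_2 ... price_rmean_12)
    #   sales_rmed_2 ... sales_rmed_12
    #   sales_lag_1_ewm_0.95 ... sales_lag_12_ewm_0.5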

    # Full pipeline: monthly aggregation, zero-filling, price repair, features.
    def transform(self, data, pred_date=None):
        df = self.get_df(data)
        df = self.add_zero_points(df, pred_date)
        # Per-pair mean sales, kept as a fallback prediction for sparse pairs.
        self.mean_values = df.groupby(['store', 'item'])['sales'].mean()
        df = self.fix_prices(df)
        df = self.build_features(df)
        return df


# End-to-end inference: preprocess, predict with the model, and fall back to
# the historical mean for (store, item) pairs with fewer than 10 observed months.
def full_solver(model, preprocesser, data, pred_date='2022-12-01'):
    cnt = preprocesser.get_df(data).groupby(['store', 'item'])['sales'].count()
    df_test = preprocesser.transform(data, pred_date)
    pred_mask = df_test['date'] == pred_date

    X_test, y_test = df_test.drop(columns=preprocesser.drop_cols), df_test['sales']
    preds = model.predict(X_test)

    df_test.loc[:, 'store'] = df_test['store'].astype(int)
    df_test.loc[:, 'item'] = df_test['item'].astype(int)
    # Sparse pairs: override the model with the pair's mean sales.
    mask = df_test.apply(lambda x: cnt[(x['store'], x['item'])], axis=1) < 10
    if mask.sum():
        preds[mask] = df_test[mask].apply(lambda x: preprocesser.mean_values[(x['store'], x['item'])], axis=1)

    preds = np.around(preds)
    # MAE over the historical rows only (the target month has no ground truth).
    print('mae:', mean_absolute_error(y_test[~pred_mask], preds[~pred_mask]))

    # Copy to avoid pandas' SettingWithCopy warning when adding columns below.
    res = df_test[['store', 'item', 'date']].copy()
    res.loc[:, 'preds'] = preds
    # Map dense ids back to the original hash identifiers.
    res.loc[:, 'store'] = res['store'].apply(lambda x: preprocesser.numbers_to_hash['store'][x])
    res.loc[:, 'item'] = res['item'].apply(lambda x: preprocesser.numbers_to_hash['item'][x])
    return res[pred_mask]


if __name__ == '__main__':
    # Load the trained model and the fitted preprocesser shipped with this commit.
    with open('model.pickle', 'rb') as f:
        model = pickle.load(f)

    with open('preprocesser.pickle', 'rb') as f:
        preprocesser = pickle.load(f)

    data = pd.read_csv('sample_input.csv')
    res = full_solver(model, preprocesser, data, '2022-12-01')
    print(res.head())

65596  main.ipynb  Normal file
File diff suppressed because one or more lines are too long

BIN  model.pickle  Normal file
Binary file not shown.

BIN  preprocesser.pickle  Normal file
Binary file not shown.

7  requirements.txt  Normal file
@@ -0,0 +1,7 @@
catboost==1.1.1
cloudpickle==2.2.1
numpy==1.23.5
pandas==1.5.2
pickleshare==0.7.5
scikit-learn==1.2.0
tqdm==4.64.1