
2024 iFLYTEK A.I.开发者大赛-讯飞开放平台
TabNet: 这个模型是我在这次比赛中的一个意外收获,在比赛中可以使用,但是需要GPU资源,否则运行真的太慢了。后面我会针对这个模型写出具体的使用方法和策略。
比赛结束后有与其他两位选手聊天,他们都是对数据做了很多分析,有的甚至直接使用Lasso就work了,效果还挺不错的。特征工程无敌呀。
整个代码部分,了解下有关特征工程的部分就行了,模型部分可以慢慢消化,当作一个新的知识点学习吧。
直接上代码
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import KFold
from pytorch_tabnet.metrics import Metric
from pytorch_tabnet.tab_model import TabNetRegressor
import torch
from torch.optim import Adam, SGD
from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingWarmRestarts
from sklearn.metrics import mean_absolute_error
import traceback
import warnings
# Silence all warnings for the rest of the script.
warnings.filterwarnings("ignore")
# Font used so that Chinese axis labels and titles render correctly.
plt.rcParams['font.sans-serif'] = ['PingFang HK']
# Render the minus sign as an ASCII hyphen so it is not shown as a garbled glyph.
# rcParams keys are case-sensitive: the valid key is "axes.unicode_minus".
plt.rcParams["axes.unicode_minus"] = False
# Show floats with 10 decimal places. The fully-qualified option name is
# required: the bare pattern "precision" matches more than one registered
# option in recent pandas releases and raises OptionError.
pd.set_option('display.precision', 10)
# Never truncate the number of rows printed for a DataFrame/Series.
pd.set_option('display.max_rows', None)
# Datetime parsing helper
def parse_date(train_df=None):
    """Add parsed datetime and calendar feature columns to *train_df*.

    The frame is modified in place and also returned. The column ``时间`` is
    parsed with ``pd.to_datetime`` and the following columns are written:
    ``datetime``, ``timestamp``, ``year``, ``month``, ``day``, ``hour``,
    ``minute`` and ``dayofweek``.

    Args:
        train_df: DataFrame containing a ``时间`` column.

    Returns:
        The same DataFrame object, with the new columns added.

    Raises:
        TypeError: if *train_df* is None (the default is only a placeholder).
    """
    if train_df is None:
        raise TypeError("parse_date() requires a DataFrame with a '时间' column")

    train_df['datetime'] = pd.to_datetime(train_df['时间'])
    parsed = train_df['datetime']

    # Integer representation of the datetime, scaled down by 1e7.
    # NOTE(review): with nanosecond-resolution datetimes this is units of
    # 10 ms since the epoch - confirm that is the intended scale.
    train_df['timestamp'] = parsed.astype('int64') / 10000000

    train_df['year'] = parsed.dt.year
    train_df['month'] = parsed.dt.month
    train_df['day'] = parsed.dt.day
    train_df['hour'] = parsed.dt.hour
    train_df["minute"] = parsed.dt.minute
    train_df['dayofweek'] = parsed.dt.dayofweek
    return train_df
def same_position_tempture_resid(train_df, index=None):
    """Add residual / set-point distance features for lower-temperature columns.

    For every ``i`` in *index* three columns are written to *train_df*
    (in place):

    * ``下部温度{i}_resid``: ``下部温度{i}`` minus ``下部温度设定{i}``.
    * ``下部温度{i}_dist_4``: ``下部温度设定4`` minus ``下部温度设定{i}``.
    * ``下部温度{i}_dist_4_moth_100``: 1 where the ``_dist_4`` value is
      >= 99, otherwise 0.

    Args:
        train_df: DataFrame holding the referenced columns.
        index: iterable of position numbers; ``None`` (default) means none.

    Returns:
        The same DataFrame object, with the new columns added.
    """
    # Avoid a mutable default argument; None stands for "no positions".
    if index is None:
        index = ()

    for i in index:
        setpoint = train_df[f'下部温度设定{i}']
        train_df[f'下部温度{i}_resid'] = train_df[f'下部温度{i}'] - setpoint
        train_df[f'下部温度{i}_dist_4'] = train_df['下部温度设定4'] - setpoint
        # NOTE(review): the column name says "100" but the threshold is 99 -
        # kept as in the original; confirm which one is intended.
        train_df[f'下部温度{i}_dist_4_moth_100'] = (train_df[f'下部温度{i}_dist_4'] >= 99) * 1
    return train_df
# Load the raw tables; paths are relative to the working directory.
df_train = pd.read_csv("../data/train.csv")
df_test = pd.read_csv("../data/test.csv")
submit = pd.read_csv("../data/submit.csv")

# Add the datetime/calendar columns, then order each split chronologically
# and give it a fresh 0..n-1 index.
df_train = parse_date(df_train).sort_values("datetime").reset_index(drop=True)
df_test = parse_date(df_test).sort_values("datetime").reset_index(drop=True)

# Marker column distinguishing the two splits (1 = train, 0 = test).
df_train['train'] = 1
df_test['train'] = 0

# Manual override of one cell; label 1057 refers to the index produced by the
# chronological sort + reset above.
# NOTE(review): presumably corrects a bad reading - confirm against the data.
df_train.loc[1057, '下部温度9'] = 829
# Group the training columns by the substring their names contain.
flow_cols = [name for name in df_train.columns if "流量" in name]
up_temp_sets = [name for name in df_train.columns if "上部温度设定" in name]
down_temp_sets = [name for name in df_train.columns if "下部温度设定" in name]

# The set-point names also contain the plain "上部温度"/"下部温度" substring,
# so they are excluded explicitly from the temperature groups.
up_tempture = [
    name
    for name in df_train.columns
    if "上部温度" in name and name not in up_temp_sets
]
down_tempture = [
    name
    for name in df_train.columns
    if "下部温度" in name and name not in down_temp_sets
]
# train_df.columns.tolist()
import re
# Temperature columns that get_same_temp() overwrites in the test split.
# The order below is significant and reproduces the hand-written list.
small_cols = (
    ['下部温度5']
    + [f'上部温度{n}' for n in range(8, 18)]
    + ['下部温度3', '下部温度4']
    + [f'下部温度{n}' for n in range(6, 18)]
    + [f'上部温度{n}' for n in range(1, 8)]
    + ['下部温度1', '下部温度2']
)
def get_same_temp(test_df, cols):
    """Overwrite temperature columns with their matching set-point columns.

    For each name in *cols* the first run of digits in the name is taken as
    the position number. Names containing ``上部温度`` are assigned the values
    of ``上部温度设定{num}``; otherwise names containing ``下部温度`` are assigned
    ``下部温度设定{num}``. Names matching neither substring are left untouched.
    The frame is modified in place.

    Args:
        test_df: DataFrame holding the set-point columns.
        cols: iterable of column names to fill.

    Returns:
        The same DataFrame object.

    Raises:
        IndexError: if a name in *cols* contains no digits.
    """
    for col in cols:
        # Raw string: "\d" in a normal string literal is an invalid escape.
        num = re.findall(r"\d+", col)[0]
        if "上部温度" in col:
            print(num, col)
            test_df[col] = test_df[f'上部温度设定{num}']
        elif "下部温度" in col:
            test_df[col] = test_df[f'下部温度设定{num}']
    return test_df
# Fill the listed temperature columns of the test split from the matching
# set-point columns.
df_test = get_same_temp(df_test, small_cols)

# Stack both splits and order the rows by calendar fields down to the minute,
# then renumber the index.
df = (
    pd.concat([df_train, df_test])
    .sort_values(['year', 'month', 'day', 'hour', "minute"])
    .reset_index(drop=True)
)

# Column-name groups used later in the script.
down_label = ['下部温度1', '下部温度2', '下部温度3']
up_label = [
    '上部温度7',
    '上部温度1',
    '上部温度2',
    '上部温度3',
    '上部温度4',
    '上部温度5',
    '上部温度6',
]
cat_cols = ['year', 'month', 'day', 'hour', 'minute', 'dayofweek']

# Snapshot of the test split's columns at this point.
keep_cols = df_test.columns.tolist()
def resid_model(y, y_pred, outlier_threshold=10000):
    """Plot residual diagnostics for predictions against true values.

    Prints the index labels of samples whose absolute residual exceeds
    *outlier_threshold*, then shows a figure with three panels:
    ``y`` vs ``y_pred`` (title: correlation), ``y`` vs residual (title:
    residual standard deviation) and a 50-bin histogram of the relative
    error ``|resid| / (y + 0.01)`` (title: outlier count).

    Args:
        y: pandas Series of true values.
        y_pred: predictions; wrapped in a Series using ``y.index``.
        outlier_threshold: absolute-residual cut-off for counting outliers.

    Returns:
        None. The function only prints and plots.
    """
    y_pred = pd.Series(y_pred, index=y.index)
    resid = y - y_pred
    std_resid = resid.std()

    # Relative error; the 0.01 offset keeps the denominator non-zero when y is 0.
    z = abs(resid) / (y + 0.01)

    is_outlier = abs(resid) > outlier_threshold
    n_outliers = int(is_outlier.sum())
    print(y[is_outlier].index)

    plt.figure(figsize=(15, 5))

    plt.subplot(1, 3, 1)
    plt.plot(y, y_pred, '.')
    plt.xlabel('y')
    plt.ylabel('y_pred')
    plt.title('corr = {:.3f}'.format(np.corrcoef(y, y_pred)[0][1]))

    plt.subplot(1, 3, 2)
    plt.plot(y, resid, '.')
    plt.xlabel('y')
    plt.ylabel('y - y_pred')
    plt.title('std resid = {:.3f}'.format(std_resid))

    ax_hist = plt.subplot(1, 3, 3)
    z.plot.hist(bins=50, ax=ax_hist)
    plt.xlabel('z')
    # The count is based on the absolute residual, not on z, so the title
    # states the criterion that was actually applied.
    plt.title('{:.0f} samples with |resid|>{}'.format(n_outliers, outlier_threshold))

    plt.show()
def get_down_tempture_sets_resid(df, diffed_col="下部温度设定4",
                                 diff_col='下部温度设定1',
                                 flag_threshold=300):
    """Add difference/ratio features between set-point columns, plus a flag.

    Two column pairs are processed: the fixed reference pair (set-point 4 vs
    set-point 1 of the upper or lower group) and the caller-supplied pair
    (*diffed_col* vs *diff_col*). For each pair ``{a}_diff_{b}`` (``a - b``)
    and ``{a}_div_{b}`` (``a / b``) are written. The upper group is selected
    when *diff_col* contains ``上部``, otherwise the lower group is used.
    ``flag`` is 1 where the reference pair's difference exceeds
    *flag_threshold*, else 0. The frame is modified in place.

    Args:
        df: DataFrame holding the referenced set-point columns.
        diffed_col: minuend / numerator column name.
        diff_col: subtrahend / denominator column name.
        flag_threshold: cut-off applied to the reference difference.

    Returns:
        The same DataFrame object, with the new columns added.
    """
    pair_diff = f'{diffed_col}_diff_{diff_col}'
    pair_div = f'{diffed_col}_div_{diff_col}'

    # Column insertion order differs between the two branches; it is kept
    # exactly as before so downstream column order is unaffected.
    if "上部" in diff_col:
        print(f"----- {diff_col}_diff_{diffed_col}")
        ref_diff = '上部温度设定4_diff_上部温度设定1'
        df[ref_diff] = df['上部温度设定4'] - df['上部温度设定1']
        df[pair_diff] = df[diffed_col] - df[diff_col]
        df['上部温度设定4_div_上部温度设定1'] = df['上部温度设定4'] / df['上部温度设定1']
        df[pair_div] = df[diffed_col] / df[diff_col]
    else:
        ref_diff = '下部温度设定4_diff_下部温度设定1'
        df[ref_diff] = df['下部温度设定4'] - df['下部温度设定1']
        df['下部温度设定4_div_下部温度设定1'] = df['下部温度设定4'] / df['下部温度设定1']
        df[pair_diff] = df[diffed_col] - df[diff_col]
        df[pair_div] = df[diffed_col] / df[diff_col]

    df['flag'] = (df[ref_diff] > flag_threshold) * 1
    return df
def get_same_type_tempure(row, df_train, label, woindows):
try:
heads = woindows
train_flag = int(row['train'])
hour = row['hour']
minute = row['minute']
timesamp = row['timestamp']
flag = row['flag']
nums = re.findall("\d+", label)
num = int(nums[0])
chars = re.findall("(\w+)(\d+)", label)[0][0]
label_map_set_col = f"{chars}设定{num}"
set_temps = row[label_map_set_col]
# (df_train[label_map_set_col]==set_temps)&
df_temp_ = df_train[
(df_train[label_map_set_col] == set_temps) & (df_train['flag'] == flag) & (df_train['hour'] == hour) & (
df_train['timestamp']








