# eis/py/dlearn/DataProcessing.py
#!/usr/bin/python3
#-*-coding: UTF-8 -*-
import Ice
import time
import logging as d
import faulthandler; faulthandler.enable()
from file.ConfigYaml import ConfigYaml
from EventNo import *
from mlearn.MLAnalyzer import MLAnalyzer
from mlearn.DataFrameCleaner import DataFrameCleaner
from mlearn.PandasDataIO import PandasDataIO
from db.DBOperator import DBOperator
from file.PathUtil import PathUtil
from sklearn.model_selection import train_test_split
class DataProcessing:
def __init__(self):
self.cfg = ConfigYaml()
self.cfg.load('config.yaml')
config = self.cfg.getProperties('db2')
self.path = PathUtil().getEnv('HOME') + '/data/'
d.info(config)
self.db2_db = DBOperator(
db_type="db2",
host=config['host'],
port=config['port'],
database=config['database'],
username=config['user'],
password=config['password'],
)
def dispatch(self, eventNo, seq):
if(eventNo == EventNo.PDI.value):
d.info(EventNo.PDI)
self.read(eventNo,seq)
elif(eventNo == EventNo.PDO.value):
d.info(EventNo.PDO)
else:
d.info("UNKNOWN EVENTNO")
def read(self, eventNo, seq):
d.info("eventNo:"+str(eventNo)+" read")
#db2_results = self.db2_db.execute_query("T_SOM_PSTA", {"entId": {"$like":'test%'}}, as_dataframe=True)
#db2_results = self.db2_db.execute_query("T_SOM_PDO",columns=['entId','steel_grade','thick','width','dthick','ent_speed_sv','ent_speed_pv','pro_speed_sv','pro_speed_pv','del_speed_sv','del_speed_pv'], as_dataframe=True)
db2_results = self.db2_db.execute_query("T_SOM_PDO",filters={"fault_slow_down" : {"$lt":1},"steel_grade" : 'JU6310E6'},
#db2_results = self.db2_db.execute_query("T_SOM_PDO",filters={"fault_slow_down" : {"$lt":1},"steel_grade" : 'DT0128D9'},
columns=['steel_grade','thick','width', 'ent_speed_sv','ent_speed_pv'], as_dataframe=True)
#columns=['steel_grade','thick','width','dthick','ent_speed_sv','ent_speed_pv','pro_speed_sv','pro_speed_pv','del_speed_sv','del_speed_pv'], as_dataframe=True)
d.info("DataFrame (with limited rows):\n%s", db2_results.head().to_string())
cleaned_df = (
DataFrameCleaner(db2_results)
# .normalize_strings('name', case='title', strip=True)
# .normalize_headers(case='lower')
.convert_types({'steel_grade': 'category'}) # 转换数据类型
.convert_types({'thick': 'category'}) # 转换数据类型
.convert_types({'width': 'category'}) # 转换数据类型
#.encode_categorical(['STEELGRADE'], method='label', drop=True)
).get_cleaned_data()
d.info("\n清洗后数据:")
d.info(cleaned_df)
d.info(cleaned_df.dtypes)
train_df, test_df = train_test_split(cleaned_df, test_size=0.2, random_state=42)
#X_train = train_df.drop(['steel_grade','thick','width','dthick','ent_speed_sv','ent_speed_pv','pro_speed_sv','pro_speed_pv','del_speed_sv','del_speed_pv'], axis=1)
X_train = train_df.drop(['ent_speed_pv'], axis=1, errors="ignore")
y_train = train_df[['ent_speed_pv']]
X_test = test_df.drop(['ent_speed_pv'], axis=1, errors="ignore")
y_test = test_df[['ent_speed_pv']]
'''
X_train = train_df.drop(['ent_speed_sv','ent_speed_pv','pro_speed_sv','pro_speed_pv','del_speed_sv','del_speed_pv'], axis=1)
y_train = train_df[['ent_speed_sv','ent_speed_pv','pro_speed_sv','pro_speed_pv','del_speed_sv','del_speed_pv']]
X_test = test_df.drop(['ent_speed_sv','ent_speed_pv','pro_speed_sv','pro_speed_pv','del_speed_sv','del_speed_pv'], axis=1)
y_test = test_df[['ent_speed_sv','ent_speed_pv','pro_speed_sv','pro_speed_pv','del_speed_sv','del_speed_pv']]
'''
# 回归分析
d.info("=== 回归分析 ===")
analyzer_reg = MLAnalyzer('random_forest_reg')
#analyzer_reg = MLAnalyzer('linear_regression', scaler_type='standard')
# analyzer_reg = MLAnalyzer('xgboost_reg')
# analyzer_reg = MLAnalyzer('mlp_regressor')
#analyzer_reg = MLAnalyzer('logistic_regression')
metrics = analyzer_reg.fit(X_train, y_train)
d.info("模型性能:%s",metrics)
d.info("特征重要性:\n%s",analyzer_reg.get_feature_importance().to_string())
analyzer_reg.save_model(self.path + 'som_model.pkl')
# analyzer_reg = MLAnalyzer.load_model('model.pkl')
d.info("X_test (with limited rows):\n%s", X_test[10:12])
# 模型预测
predictions = analyzer_reg.predict(X_test)
d.info("\n预测结果(前5个):")
d.info("predictions (with limited rows):\n%s", predictions[:5])
#analyzer_reg.save_prediction_results( X_test, y_test, path + 'prediction_results.csv')
#analyzer_reg.plot_predictions(X_test, y_test, 60, save_path = path + 'prediction_results.png')
# 模型评估
d.info("\n=== 模型评估 ===")
test_metrics = analyzer_reg.evaluate(X_test, y_test)
d.info("测试集评估指标:")
d.info(test_metrics)