# eis/py/glitch/GlitchDetector.py

import pandas as pd
import numpy as np
from scipy.stats import kurtosis

class GlitchDetector:
    """Flag glitches (spikes) in a data column by comparing kurtosis to a baseline.

    The data column is centered on its median and scaled by its IQR, the
    excess (Fisher) kurtosis of the result is computed, and a glitch is
    reported when the absolute kurtosis exceeds
    ``abs(kurtosis_threshold * baseline_kurtosis)``.
    """

    def __init__(self, historical_data=None, kurtosis_threshold=3.0):
        """Initialize the detector.

        Parameters:
        historical_data: DataFrame or None, historical "normal" data used to
            establish the baseline lazily on the first detection call.
        kurtosis_threshold: float, multiplier applied to the baseline
            kurtosis to obtain the detection bound.
        """
        self.historical_data = historical_data
        self.kurtosis_threshold = kurtosis_threshold
        self.iqr_params = {}  # Q1 / Q3 / IQR / median of the last normalized series
        self.baseline_kurtosis = None  # set by establish_baseline or detect_glitches
        # Filled in by detect_glitches: lower bound, upper bound and the
        # kurtosis value of the most recently analysed data.
        self.lb = 0
        self.ub = 0
        self.va = 0

    def iqr_normalize(self, df, column='data'):
        """Add a ``<column>_normalized`` column computed as (x - median) / IQR.

        Parameters:
        df: DataFrame, input data.
        column: str, name of the column to normalize.

        Returns:
        DataFrame, a copy of ``df`` with the extra normalized column. An
        empty ``df`` is returned unchanged (no column is added).

        Side effect: ``self.iqr_params`` is overwritten with the Q1, Q3, IQR
        and median of this call's data.
        """
        if df.empty:
            return df

        data_series = df[column]

        q1 = data_series.quantile(0.25)
        q3 = data_series.quantile(0.75)
        iqr = q3 - q1
        median = data_series.median()

        # Keep the raw statistics (including a zero IQR) for callers.
        self.iqr_params = {'Q1': q1, 'Q3': q3, 'IQR': iqr, 'median': median}

        # A zero IQR (at least the middle half of the values identical) would
        # turn the division into NaN/inf and hide every spike. Kurtosis does
        # not depend on the scale factor, so fall back to centering only.
        scale = iqr if iqr != 0 else 1.0

        result_df = df.copy()
        result_df[f'{column}_normalized'] = (data_series - median) / scale

        return result_df

    def calculate_kurtosis(self, df, column='data_normalized'):
        """Return the excess (Fisher) kurtosis of ``df[column]``.

        Parameters:
        df: DataFrame, input data.
        column: str, name of the column to analyse.

        Returns:
        float, the kurtosis; 0 when ``df`` is empty or fewer than 4 non-NaN
        values remain.
        """
        if df.empty:
            return 0

        data_series = df[column].dropna()
        if len(data_series) < 4:  # too few points for a meaningful kurtosis
            return 0

        # Fisher definition: a normal distribution has kurtosis 0.
        return kurtosis(data_series, fisher=True)

    def establish_baseline(self, historical_df, column='data'):
        """Compute and store the baseline kurtosis from historical data.

        Parameters:
        historical_df: DataFrame, historical "normal" data.
        column: str, name of the data column.

        Raises:
        ValueError: if ``historical_df`` is None or empty.
        """
        if historical_df is None or historical_df.empty:
            raise ValueError("历史数据不能为空")

        normalized_historical = self.iqr_normalize(historical_df, column)

        # Read the column iqr_normalize actually created; relying on the
        # 'data_normalized' default breaks for any other column name.
        self.baseline_kurtosis = self.calculate_kurtosis(
            normalized_historical, f'{column}_normalized'
        )

    def detect_glitches(self, current_df, column='data', baseline_kurtosis=None):
        """Decide whether ``current_df[column]`` contains glitches.

        Parameters:
        current_df: DataFrame, data to analyse.
        column: str, name of the data column.
        baseline_kurtosis: float or None. When given, it replaces the stored
            baseline (and stays stored for later calls). Otherwise the
            baseline is established from ``historical_data`` if not yet set.

        Returns:
        dict with keys 'has_glitch', 'current_kurtosis', 'baseline_kurtosis',
        'deviation', 'threshold', 'normalized_data' and 'iqr_params'.

        Raises:
        ValueError: if no baseline is available and no historical data was
            provided.
        """
        if baseline_kurtosis is not None:
            self.baseline_kurtosis = baseline_kurtosis
        if self.baseline_kurtosis is None and self.historical_data is not None:
            self.establish_baseline(self.historical_data, column)
        elif self.baseline_kurtosis is None:
            raise ValueError("未提供历史数据或未建立基准")

        normalized_current = self.iqr_normalize(current_df, column)

        # Use the column produced for this `column`, not the default name.
        current_kurtosis = self.calculate_kurtosis(
            normalized_current, f'{column}_normalized'
        )

        # Reported for information only; the decision below does not use it.
        kurtosis_deviation = abs(current_kurtosis - self.baseline_kurtosis)

        bound = abs(self.kurtosis_threshold * self.baseline_kurtosis)
        self.va = current_kurtosis
        self.lb = -bound
        self.ub = bound

        # Plain bool so the flag has the same type whether the kurtosis came
        # from scipy (numpy scalar) or from the early-return value 0.
        has_glitch = bool(abs(self.va) > self.ub)

        return {
            'has_glitch': has_glitch,
            'current_kurtosis': current_kurtosis,
            'baseline_kurtosis': self.baseline_kurtosis,
            'deviation': kurtosis_deviation,
            'threshold': self.kurtosis_threshold,
            'normalized_data': normalized_current,
            'iqr_params': self.iqr_params
        }


# Usage example
def example_usage():
    """
    Demonstrate the detector on synthetic data.

    Builds a noisy ramp of 200 hourly samples as the "normal" series, copies
    it and perturbs nine positions with large random offsets, then runs
    GlitchDetector on the perturbed copy and prints the outcome. Because an
    explicit baseline_kurtosis is passed to detect_glitches, that value is
    used as the baseline instead of one computed from the historical frame.

    Returns:
    dict, the detection result, or None if detection raised an exception.
    """
    np.random.seed(42)  # fixed seed so the demo is reproducible

    sample_count = 200
    timestamps = pd.date_range('2024-01-01', periods=sample_count, freq='h')

    # "Normal" series: linear ramp plus Gaussian noise.
    ramp = np.linspace(0, 4*np.pi, sample_count)
    clean_values = ramp + np.random.normal(0, 0.5, sample_count)
    historical_df = pd.DataFrame({'data': clean_values}, index=timestamps)

    # Series under test: same values with random spikes injected.
    spiky_values = clean_values.copy()
    for position in (10, 30, 50, 70, 90, 110, 130, 150, 170):
        spiky_values[position] += 15 * np.random.randn()
    current_df = pd.DataFrame({'data': spiky_values}, index=timestamps)

    detector = GlitchDetector(historical_df, kurtosis_threshold=20.0)

    try:
        outcome = detector.detect_glitches(current_df, baseline_kurtosis=0.08)

        print("=== 毛刺检测结果 ===")
        print(f"是否存在毛刺: {outcome['has_glitch']}")
        print(f"当前数据峰度: {outcome['current_kurtosis']:.4f}")
        print(f"基准峰度: {outcome['baseline_kurtosis']:.4f}")
        print(f"峰度偏差: {outcome['deviation']:.4f}")
        print(f"检测阈值: {outcome['threshold']}")

        return outcome

    except Exception as e:
        print(f"检测过程中出错: {e}")
        return None

# Batch processing helper
def batch_glitch_detection(historical_df, test_dfs, column='data', threshold=3.0):
    """
    Run glitch detection over several datasets with one shared detector.

    A single GlitchDetector is created from historical_df and reused for
    every dataset. A dataset whose detection raises is reported on stdout
    and left out of the summary; the remaining datasets are still processed.

    Parameters:
    historical_df: DataFrame, historical "normal" data.
    test_dfs: list, DataFrames to test.
    column: str, name of the data column.
    threshold: float, detection threshold passed to the detector.

    Returns:
    DataFrame with columns batch_id, has_glitch, current_kurtosis, deviation
    and data_points, one row per successfully processed dataset. The columns
    are present even when no dataset succeeded.
    """
    detector = GlitchDetector(historical_df, kurtosis_threshold=threshold)

    summary_columns = [
        'batch_id',
        'has_glitch',
        'current_kurtosis',
        'deviation',
        'data_points',
    ]

    rows = []
    for i, test_df in enumerate(test_dfs):
        try:
            result = detector.detect_glitches(test_df, column)
            rows.append({
                'batch_id': i,
                'has_glitch': result['has_glitch'],
                'current_kurtosis': result['current_kurtosis'],
                'deviation': result['deviation'],
                'data_points': len(test_df),
            })
        except Exception as e:
            # Best-effort batch: report the failure and continue.
            print(f"第 {i} 个数据集检测失败: {e}")

    # Passing the column list keeps the schema stable when `rows` is empty;
    # without it the result would be a DataFrame with no columns at all.
    return pd.DataFrame(rows, columns=summary_columns)

if __name__ == "__main__":
    # Run the demo when the file is executed as a script.
    result = example_usage()