#include #include namespace DAA { using namespace boost::math; Dist::Dist() { logger_ = std::make_unique("DAA::Dist"); } Dist::~Dist() {} // 得到四分点的距离信息 // 解压数据的四分位Q 与分布模型的四分位q 的距离 // dist = |Q1-q1|+ |Q2-q2|+ |Q3-q3| double get_quartile_distance(double p11, double p12, double p21, double p22, double p31, double p32) { return sqrt(pow(p11 - p12, 2) + pow(p21 - p22, 2) + pow(p31 - p32, 2)); } // 得到T分布的四分点的距离信息 double get_st_quartile_distance(boost::math::students_t dist, double mean, double stddev, double df, double p1, double p2, double p3) { double t1 = quantile(dist, 0.75); double w1 = t1 * stddev / sqrt(df + 1); return sqrt(pow(mean - w1 - p1, 2) + pow(mean - p2, 2) + pow(mean + w1 - p1, 2)); } Dist::DistTypes Dist::get_distribution_type() const { return this->dist_type_; } int Dist::set_predefined_prob(double prob) { this->predefied_prob_ = prob; return 0; } double Dist::get_shifted_prob() const { return prob_; } /** * @brief 检测分布 * @param rs 统计特征值 * @param tmp_data !从小到大排列的分布数据 * @return int */ int Dist::auto_test(dlib::running_stats rs, const std::vector& tmp_data) { try { this->rs_ = rs; std::tuple> test; // 如果数据重要特征为0或者数据量太小,则认为是无效数据 // 如果数据标准差过大,则也认为是无效数据 if ((this->rs_.min() == 0 && this->rs_.max() == 0) || (this->rs_.stddev() == 0 || this->rs_.current_n() < 20) || (rs_.stddev() / rs_.mean() > 3)) { this->valid_ = false; this->dist_type_ = DistTypes::unknown; logger_->Error() << "样本无效!数据重要特征为0,或者数据量太小,数据标准差过大!" << "/n min:" << this->rs_.min() << ",max:" << this->rs_.max() << ",stddev:" << this->rs_.stddev() << ",current:" << this->rs_.current_n() << ",mean:" << this->rs_.mean() << endl; return -1; } else { this->valid_ = true; } // 如果验证数据非空,则对验证数据进行测试,找到最相似的分布 //无效数据也不进行计算 if (!tmp_data.empty() && this->valid_) { double p1, p2, p3; auto tmp_data_size = tmp_data.size(); // 得到四分点的数据 p1 = tmp_data[tmp_data_size * 1 / 4]; p2 = tmp_data[tmp_data_size * 2 / 4]; p3 = tmp_data[tmp_data_size * 3 / 4]; normal_ = std::make_shared(rs_.mean(), rs_.stddev()); if (rs_.skewness() > 0.1) { // 计算自由的 double df = boost::math::students_t::find_degrees_of_freedom( rs_.skewness(), 0.05, 0.05, rs_.stddev()); // 根据自由的构造 t分布 students_t_ = std::make_shared(df); // 构造偏态分布 skew_normal_ = std::make_shared( rs_.mean(), rs_.stddev(), rs_.skewness()); double distance_student = get_st_quartile_distance( *students_t_, rs_.mean(), rs_.stddev(), df, p1, p2, p3); double distance_skew_normal = get_quartile_distance( p1, quantile(*skew_normal_, 0.25), p2, quantile(*skew_normal_, 0.5), p3, quantile(*skew_normal_, 0.75)); double distance_normal = get_quartile_distance( p1, quantile(*normal_, 0.25), p2, quantile(*normal_, 0.5), p3, quantile(*normal_, 0.75)); // 用给定假设分布的数据 if (distance_normal <= distance_student && distance_normal <= distance_skew_normal) { this->dist_type_ = DistTypes::normal; logger_->Info() << "数据判定为正态分布!" << endl; } else if (distance_student <= distance_normal && distance_student <= distance_skew_normal) { this->dist_type_ = DistTypes::student_t; logger_->Info() << "数据判定为T分布!" << endl; } else if (distance_skew_normal <= distance_normal && distance_skew_normal <= distance_student) { this->dist_type_ = DistTypes::skew_normal; logger_->Info() << "数据判定为偏态分布!" << endl; } } else { logger_->Info() << "偏态系数小于等于0.1,数据判定为正态分布!" << endl; this->dist_type_ = DistTypes::normal; } } else { // 如果验证数据为空,则暂时认为是正态分布 this->dist_type_ = DistTypes::normal; normal_ = std::make_shared(rs_.mean(), rs_.stddev()); logger_->Info() << "验证数据为空,数据默认判断正态分布!" << endl; } this->prob_ = predefied_prob_; // 根据预设置信区间和输入数据,对输入数据进行验证,动态扩大置信区间 auto error_rate = get_error_rate_type_1(prob_, tmp_data); logger_->Debug() << "预设置信度:" << prob_ << ",首次第一类错误率:" << error_rate << endl; while (error_rate > 0.3) { if (prob_ < 0.99) { prob_ += 0.01; error_rate = get_error_rate_type_1(prob_, tmp_data); } else { break; } } // 得到置信范围 this->legal_range_ = get_range(prob_); logger_->Debug() << "最终置信度:" << prob_ << ",第一类错误率:" << error_rate << ",置信区间:[" << this->legal_range_.get_left() << "," << this->legal_range_.get_right() << "]" << endl; } catch (const std::exception& e) { logger_->Error() << "Dist::auto_test 异常!" << e.what() << ",location:" << BOOST_CURRENT_LOCATION << endl; this->valid_ = false; return -1; } return 0; } // 设置报警数据信息,如果存在确切的报警数据,该数据可以帮助我们更好地判断总体的数据类型 // int Dist::set_warning_sample_optional( // const std::vector& warning_sample) { // this->warning_sample_ = warning_sample_; // return 0; // } // 得到第一类错误的错误率(如果数据正确而被判断出错) double Dist::get_error_rate_type_1(const double& prob, const std::vector& tmp_data) { if (tmp_data.empty()) { return 0; } auto range = this->get_range(prob); size_t lt_left = 0; size_t gt_right = 0; for (auto x : tmp_data) { if (range.get_left() <= x + 0.1) { break; } lt_left++; } for (auto r_it = tmp_data.rbegin(); r_it != tmp_data.rend(); r_it++) { if (range.get_right() >= (*r_it) - 0.1) { break; } gt_right++; } logger_->Debug() << "lt_left:" << lt_left << ",gt_right:" << gt_right << ",tmp_data.size():" << tmp_data.size() << endl; return double((lt_left + gt_right)) / double(tmp_data.size()); ///<小于1的小数 } bool Dist::valid() const { return valid_; } mix_cc::float_range_t Dist::get_range() const { return this->legal_range_; } /** * @brief 计算置信区间 * @param prob 置信度 * @return mix_cc::float_range_t */ mix_cc::float_range_t Dist::get_range(double prob) { double dest_prob = prob / 2 + 0.5; try { if (this->dist_type_ == DistTypes::normal) { return mix_cc::float_range_t{quantile(*normal_, 1 - dest_prob), quantile(*normal_, dest_prob)}; } } catch (const std::exception& e) { std::throw_with_nested(mix_cc::Exception( -1, "legal range get error, distribution type: normal", BOOST_CURRENT_LOCATION)); } try { if (this->dist_type_ == DistTypes::skew_normal) { return mix_cc::float_range_t{ quantile(*skew_normal_, 1 - dest_prob), quantile(*skew_normal_, prob)}; // 2021-10-28 偏态 正态分布 } } catch (const std::exception& e) { std::throw_with_nested(mix_cc::Exception( -1, "legal range get error, distribution type: skew-normal", BOOST_CURRENT_LOCATION)); } try { if (this->dist_type_ == DistTypes::student_t) { return mix_cc::float_range_t{rs_.mean() - quantile(*students_t_, prob), rs_.mean() + quantile(*students_t_, prob)}; } } catch (const std::exception& e) { std::throw_with_nested(mix_cc::Exception( -1, "legal range get error, distribution type: student", BOOST_CURRENT_LOCATION)); } return mix_cc::float_range_t{0, 0}; } } // namespace DAA