#include #include namespace distribution { using namespace boost::math; Dist::Dist(/* args */) {} Dist::~Dist() {} // 得到四分点的距离信息 // 解压数据的四分位Q 与分布模型的四分位q 的距离 // dist = |Q1-q1|+ |Q2-q2|+ |Q3-q3| double get_quartile_distance(double p11, double p12, double p21, double p22, double p31, double p32) { return sqrt(pow(p11 - p12, 2) + pow(p21 - p22, 2) + pow(p31 - p32, 2)); } // 得到T分布的四分点的距离信息 double get_st_quartile_distance(boost::math::students_t dist, double mean, double stddev, double df, double p1, double p2, double p3) { double t1 = quantile(dist, 0.75); double w1 = t1 * stddev / sqrt(df + 1); return sqrt(pow(mean - w1 - p1, 2) + pow(mean - p2, 2) + pow(mean + w1 - p1, 2)); } Dist::DistTypes Dist::get_distribution_type() const { return this->dist_type_; } int Dist::set_predefined_prob(double prob) { this->predefied_prob_ = prob; return 0; } double Dist::get_shifted_prob() const { return prob_; } /** * @brief 检测分布 * @param rs 统计特征值 * @param tmp_data !从小到大排列的分布数据 * @return int */ int Dist::auto_test(dlib::running_stats rs, const std::vector& tmp_data) { try { this->rs_ = rs; std::tuple> test; // 如果数据重要特征为0或者数据量太小,则认为是无效数据 // 如果数据标准差过大,则也认为是无效数据 if ((this->rs_.min() == 0 && this->rs_.max() == 0) || (this->rs_.stddev() == 0 || this->rs_.current_n() < 20) || (rs_.stddev() / rs_.mean() > 3)) { this->valid_ = false; this->dist_type_ = DistTypes::unknown; return 0; } else { this->valid_ = true; } // 如果验证数据非空,则对验证数据进行测试,找到最相似的分布 //无效数据也不进行计算 if (!tmp_data.empty() && this->valid_) { double p1, p2, p3; auto tmp_data_size = tmp_data.size(); // 得到四分点的数据 p1 = tmp_data[tmp_data_size * 1 / 4]; p2 = tmp_data[tmp_data_size * 2 / 4]; p3 = tmp_data[tmp_data_size * 3 / 4]; normal_ = std::make_shared(rs_.mean(), rs_.stddev()); if (rs_.skewness() > 0.1) { // 计算自由的 double df = boost::math::students_t::find_degrees_of_freedom( rs_.skewness(), 0.05, 0.05, rs_.stddev()); // 根据自由的构造 t分布 students_t_ = std::make_shared(df); // 构造偏态分布 skew_normal_ = std::make_shared( rs_.mean(), rs_.stddev(), rs_.skewness()); double distance_student = get_st_quartile_distance( *students_t_, rs_.mean(), rs_.stddev(), df, p1, p2, p3); double distance_skew_normal = get_quartile_distance( p1, quantile(*skew_normal_, 0.25), p2, quantile(*skew_normal_, 0.5), p3, quantile(*skew_normal_, 0.75)); double distance_normal = get_quartile_distance( p1, quantile(*normal_, 0.25), p2, quantile(*normal_, 0.5), p3, quantile(*normal_, 0.75)); // 用给定假设分布的数据 if (distance_normal <= distance_student && distance_normal <= distance_skew_normal) { this->dist_type_ = DistTypes::normal; } else if (distance_student <= distance_normal && distance_student <= distance_skew_normal) { this->dist_type_ = DistTypes::student_t; } else if (distance_skew_normal <= distance_normal && distance_skew_normal <= distance_student) { this->dist_type_ = DistTypes::skew_normal; } } else { this->dist_type_ = DistTypes::normal; } } else { // 如果验证数据为空,则暂时认为是正态分布 this->dist_type_ = DistTypes::normal; normal_ = std::make_shared(rs_.mean(), rs_.stddev()); } this->prob_ = predefied_prob_; // 根据预设置信区间和输入数据,对输入数据进行验证,动态扩大置信区间 auto error_rate = get_error_rate_type_1(prob_, tmp_data); while (error_rate > 0.3) { if (prob_ < 0.99) { prob_ += 0.01; error_rate = get_error_rate_type_1(prob_, tmp_data); } else { break; } } // 得到置信范围 this->legal_range_ = get_range(prob_); } catch (const std::exception& e) { std::throw_with_nested(mix_cc::Exception(-1, "distribution test error", BOOST_CURRENT_LOCATION)); } return 0; } // 设置报警数据信息,如果存在确切的报警数据,该数据可以帮助我们更好地判断总体的数据类型 int Dist::set_warning_sample_optional( const std::vector& warning_sample) { this->warning_sample_ = warning_sample_; return 0; } // 得到第一类错误的错误率(如果数据正确而被判断出错) double Dist::get_error_rate_type_1(const double& prob, const std::vector& tmp_data) { if (tmp_data.empty()) { return 0; } auto range = this->get_range(prob); size_t lt_left, gt_right = 0; for (auto x : tmp_data) { if (range.get_left() <= x + 0.1) { break; } lt_left++; } for (auto r_it = tmp_data.rbegin(); r_it != tmp_data.rend(); r_it++) { if (range.get_right() >= (*r_it) - 0.1) { break; } gt_right++; } return double((lt_left + gt_right)) / double(tmp_data.size()); ///<小于1的小数 } bool Dist::valid() const { return valid_; } mix_cc::float_range_t Dist::get_range() const { return this->legal_range_; } /** * @brief 计算置信区间 * @param prob 置信度 * @return mix_cc::float_range_t */ mix_cc::float_range_t Dist::get_range(double prob) { double dest_prob = prob / 2 + 0.5; try { if (this->dist_type_ == DistTypes::normal) { return mix_cc::float_range_t{quantile(*normal_, 1 - dest_prob), quantile(*normal_, dest_prob)}; } } catch (const std::exception& e) { std::throw_with_nested(mix_cc::Exception( -1, "legal range get error, distribution type: normal", BOOST_CURRENT_LOCATION)); } try { if (this->dist_type_ == DistTypes::skew_normal) { return mix_cc::float_range_t{ quantile(*skew_normal_, 1 - dest_prob), quantile(*skew_normal_, prob)}; // 2021-10-28 偏态 正态分布 } } catch (const std::exception& e) { std::throw_with_nested(mix_cc::Exception( -1, "legal range get error, distribution type: skew-normal", BOOST_CURRENT_LOCATION)); } try { if (this->dist_type_ == DistTypes::student_t) { return mix_cc::float_range_t{rs_.mean() - quantile(*students_t_, prob), rs_.mean() + quantile(*students_t_, prob)}; } } catch (const std::exception& e) { std::throw_with_nested(mix_cc::Exception( -1, "legal range get error, distribution type: student", BOOST_CURRENT_LOCATION)); } return mix_cc::float_range_t{0, 0}; } // namespace distribution } // namespace distribution