eis/eqpalg/feature_extraction/distribution.cc
2026-05-09 13:32:10 +08:00

226 lines
8.3 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include <eqpalg/feature_extraction/distribution.h>
#include <mix_cc/exception.h>
// Everything defined in this file lives in the DAA namespace.
namespace DAA {
// Brings boost::math names into scope; the code below calls quantile(...)
// unqualified.
using namespace boost::math;
// Creates the logger used by this object, registered under "DAA::Dist".
Dist::Dist() {
  this->logger_ = std::make_unique<LOG>("DAA::Dist");
}
// No explicit cleanup is performed on destruction.
Dist::~Dist() = default;
// Quartile distance between two sets of quartile values.
// Each (pX1, pX2) pair holds the two values compared for quartile X (callers
// pass the sample quartile and the model quartile). The result is the
// Euclidean norm of the three pairwise differences:
//   dist = sqrt((p11 - p12)^2 + (p21 - p22)^2 + (p31 - p32)^2)
double get_quartile_distance(double p11, double p12, double p21, double p22,
                             double p31, double p32) {
  const double sq_first = std::pow(p11 - p12, 2);
  const double sq_second = std::pow(p21 - p22, 2);
  const double sq_third = std::pow(p31 - p32, 2);
  return std::sqrt(sq_first + sq_second + sq_third);
}
// Quartile distance between the sample and a Student's t model.
//
// The model quartiles are taken as
//   Q1 = mean - w,  Q2 = mean,  Q3 = mean + w,
// where w = t(0.75) * stddev / sqrt(df + 1) and t(0.75) is the 0.75 quantile
// of `dist`. The result is the Euclidean norm of (Q1 - p1, Q2 - p2, Q3 - p3).
//
// dist    Student's t distribution used for the 0.75 quantile.
// mean    sample mean (centre of the model).
// stddev  sample standard deviation.
// df      degrees of freedom used to scale the quartile offset.
// p1..p3  first, second and third quartile of the sample.
double get_st_quartile_distance(boost::math::students_t dist, double mean,
                                double stddev, double df, double p1, double p2,
                                double p3) {
  const double t_upper = quantile(dist, 0.75);
  // Offset of the model's outer quartiles from the mean, in data units.
  const double offset = t_upper * stddev / std::sqrt(df + 1);
  const double d1 = mean - offset - p1;
  const double d2 = mean - p2;
  // The upper model quartile must be compared with the sample's third
  // quartile (p3); comparing it with p1 skewed the distance.
  const double d3 = mean + offset - p3;
  return std::sqrt(d1 * d1 + d2 * d2 + d3 * d3);
}
// Returns the distribution type currently stored in dist_type_.
Dist::DistTypes Dist::get_distribution_type() const {
  return dist_type_;
}
int Dist::set_predefined_prob(double prob) {
this->predefied_prob_ = prob;
return 0;
}
double Dist::get_shifted_prob() const { return prob_; }
/**
 * @brief Selects a distribution model for the sample and derives its range.
 *
 * Steps:
 *  1. Store `rs` and reject the sample (valid_ = false, type unknown) when
 *     min and max are both 0, stddev is 0, fewer than 20 values were seen,
 *     or stddev / mean exceeds 3.
 *  2. If `tmp_data` is non-empty and skewness > 0.1, compare the sample
 *     quartiles against normal, Student's t and skew-normal candidates and
 *     keep the closest one; otherwise fall back to the normal distribution.
 *  3. Starting from the predefined probability, raise prob_ in 0.01 steps
 *     (while below 0.99) as long as the type-I error rate on `tmp_data`
 *     stays above 0.3, then store the resulting range in legal_range_.
 *
 * @param rs       Running statistics of the sample.
 * @param tmp_data Validation data; must be sorted in ascending order.
 * @return 0 on success; -1 when the sample is rejected or a std::exception
 *         is caught (valid_ is set to false in both cases).
 */
int Dist::auto_test(dlib::running_stats<double> rs,
                    const std::vector<double>& tmp_data) {
  try {
    this->rs_ = rs;
    // Treat the sample as invalid when its key statistics are zero, when it
    // is too small, or when the standard deviation is too large relative to
    // the mean.
    // NOTE(review): stddev / mean is signed, so a negative mean never trips
    // the "> 3" check — confirm whether |mean| was intended.
    if ((this->rs_.min() == 0 && this->rs_.max() == 0) ||
        (this->rs_.stddev() == 0 || this->rs_.current_n() < 20) ||
        (rs_.stddev() / rs_.mean() > 3)) {
      this->valid_ = false;
      this->dist_type_ = DistTypes::unknown;
      // "\n" (newline) replaces the mistyped "/n" in the original message.
      logger_->Error()
          << "样本无效数据重要特征为0或者数据量太小数据标准差过大"
          << "\n min:" << this->rs_.min() << ",max:" << this->rs_.max()
          << ",stddev:" << this->rs_.stddev()
          << ",current:" << this->rs_.current_n()
          << ",mean:" << this->rs_.mean() << endl;
      return -1;
    }
    this->valid_ = true;

    // With validation data available, look for the best-matching
    // distribution; invalid samples never reach this point.
    if (!tmp_data.empty() && this->valid_) {
      const auto tmp_data_size = tmp_data.size();
      // Sample quartiles, read directly from the sorted data.
      const double p1 = tmp_data[tmp_data_size * 1 / 4];
      const double p2 = tmp_data[tmp_data_size * 2 / 4];
      const double p3 = tmp_data[tmp_data_size * 3 / 4];
      normal_ = std::make_shared<boost::math::normal>(rs_.mean(), rs_.stddev());
      if (rs_.skewness() > 0.1) {
        // Estimate the degrees of freedom.
        // NOTE(review): the skewness is passed as the first argument of
        // find_degrees_of_freedom — confirm this is the intended input.
        double df = boost::math::students_t::find_degrees_of_freedom(
            rs_.skewness(), 0.05, 0.05, rs_.stddev());
        // Build the t distribution from the estimated degrees of freedom.
        students_t_ = std::make_shared<boost::math::students_t>(df);
        // Build the skew-normal distribution.
        skew_normal_ = std::make_shared<boost::math::skew_normal>(
            rs_.mean(), rs_.stddev(), rs_.skewness());
        double distance_student = get_st_quartile_distance(
            *students_t_, rs_.mean(), rs_.stddev(), df, p1, p2, p3);
        double distance_skew_normal = get_quartile_distance(
            p1, quantile(*skew_normal_, 0.25), p2, quantile(*skew_normal_, 0.5),
            p3, quantile(*skew_normal_, 0.75));
        double distance_normal = get_quartile_distance(
            p1, quantile(*normal_, 0.25), p2, quantile(*normal_, 0.5), p3,
            quantile(*normal_, 0.75));
        // Keep the candidate with the smallest quartile distance; ties are
        // resolved in the order normal, Student's t, skew-normal.
        if (distance_normal <= distance_student &&
            distance_normal <= distance_skew_normal) {
          this->dist_type_ = DistTypes::normal;
          logger_->Info() << "数据判定为正态分布!" << endl;
        } else if (distance_student <= distance_normal &&
                   distance_student <= distance_skew_normal) {
          this->dist_type_ = DistTypes::student_t;
          logger_->Info() << "数据判定为T分布" << endl;
        } else if (distance_skew_normal <= distance_normal &&
                   distance_skew_normal <= distance_student) {
          this->dist_type_ = DistTypes::skew_normal;
          logger_->Info() << "数据判定为偏态分布!" << endl;
        }
      } else {
        logger_->Info() << "偏态系数小于等于0.1,数据判定为正态分布!" << endl;
        this->dist_type_ = DistTypes::normal;
      }
    } else {
      // Without validation data, assume a normal distribution for now.
      this->dist_type_ = DistTypes::normal;
      normal_ = std::make_shared<boost::math::normal>(rs_.mean(), rs_.stddev());
      logger_->Info() << "验证数据为空,数据默认判断正态分布!" << endl;
    }
    this->prob_ = predefied_prob_;
    // Check the validation data against the preset confidence level and
    // widen the level step by step while too many values fall outside.
    auto error_rate = get_error_rate_type_1(prob_, tmp_data);
    logger_->Debug() << "预设置信度:" << prob_ << ",首次第一类错误率:"
                     << error_rate << endl;
    while (error_rate > 0.3) {
      if (prob_ < 0.99) {
        prob_ += 0.01;
        error_rate = get_error_rate_type_1(prob_, tmp_data);
      } else {
        break;
      }
    }
    // Final range for the (possibly widened) confidence level.
    this->legal_range_ = get_range(prob_);
    logger_->Debug() << "最终置信度:" << prob_ << ",第一类错误率:"
                     << error_rate << ",置信区间:["
                     << this->legal_range_.get_left() << ","
                     << this->legal_range_.get_right() << "]" << endl;
  } catch (const std::exception& e) {
    logger_->Error() << "Dist::auto_test 异常!" << e.what()
                     << ",location:" << BOOST_CURRENT_LOCATION << endl;
    this->valid_ = false;
    return -1;
  }
  return 0;
}
// Type-I error rate: the share of validation values that the range for
// `prob` rejects (values treated as wrong although they are real data).
//
// `tmp_data` is scanned from both ends, stopping at the first value that is
// inside the range with a tolerance of 0.1, so it is expected to be sorted in
// ascending order. Returns 0 when `tmp_data` is empty.
double Dist::get_error_rate_type_1(const double& prob,
                                   const std::vector<double>& tmp_data) {
  if (tmp_data.empty()) {
    return 0;
  }
  auto range = this->get_range(prob);

  // Leading values that lie more than 0.1 below the left bound.
  size_t lt_left = 0;
  auto fwd = tmp_data.begin();
  while (fwd != tmp_data.end() && !(range.get_left() <= *fwd + 0.1)) {
    ++lt_left;
    ++fwd;
  }

  // Trailing values that lie more than 0.1 above the right bound.
  size_t gt_right = 0;
  for (size_t idx = tmp_data.size(); idx > 0; --idx) {
    if (range.get_right() >= tmp_data[idx - 1] - 0.1) {
      break;
    }
    ++gt_right;
  }

  logger_->Debug() << "lt_left:" << lt_left << ",gt_right:" << gt_right
                   << ",tmp_data.size():" << tmp_data.size() << endl;
  const size_t rejected = lt_left + gt_right;
  return static_cast<double>(rejected) / static_cast<double>(tmp_data.size());
}
bool Dist::valid() const { return valid_; }
// Returns the stored legal range (legal_range_) without recomputing it.
mix_cc::float_range_t Dist::get_range() const {
  return legal_range_;
}
/**
 * @brief Computes the two-sided range for confidence level `prob` under the
 *        currently selected distribution type.
 *
 * The range is central: it spans the quantile levels (1 - prob) / 2 and
 * (1 + prob) / 2, so each tail holds the same probability. All three
 * distribution branches use these same levels.
 *
 * @param prob Confidence level (coverage of the returned range).
 * @return The range for the current dist_type_; {0, 0} when the type is none
 *         of normal, skew_normal or student_t.
 * @throws mix_cc::Exception nested around the original std::exception when a
 *         quantile computation fails.
 */
mix_cc::float_range_t Dist::get_range(double prob) {
  // Upper quantile level of a central interval with coverage `prob`; the
  // matching lower level is 1 - dest_prob.
  const double dest_prob = prob / 2 + 0.5;
  try {
    if (this->dist_type_ == DistTypes::normal) {
      return mix_cc::float_range_t{quantile(*normal_, 1 - dest_prob),
                                   quantile(*normal_, dest_prob)};
    }
  } catch (const std::exception&) {
    std::throw_with_nested(mix_cc::Exception(
        -1, "legal range get error, distribution type: normal",
        BOOST_CURRENT_LOCATION));
  }
  try {
    if (this->dist_type_ == DistTypes::skew_normal) {
      // The upper bound uses dest_prob (not prob) so that it pairs with the
      // 1 - dest_prob lower bound and the range really covers `prob`.
      return mix_cc::float_range_t{quantile(*skew_normal_, 1 - dest_prob),
                                   quantile(*skew_normal_, dest_prob)};
    }
  } catch (const std::exception&) {
    std::throw_with_nested(mix_cc::Exception(
        -1, "legal range get error, distribution type: skew-normal",
        BOOST_CURRENT_LOCATION));
  }
  try {
    if (this->dist_type_ == DistTypes::student_t) {
      // The t distribution is symmetric about 0, so one quantile at
      // dest_prob gives the half-width of the two-sided range.
      // NOTE(review): the half-width is not scaled by the sample standard
      // deviation, unlike get_st_quartile_distance — confirm the intended
      // scale before relying on this range for non-unit-variance data.
      const double half_width = quantile(*students_t_, dest_prob);
      return mix_cc::float_range_t{rs_.mean() - half_width,
                                   rs_.mean() + half_width};
    }
  } catch (const std::exception&) {
    std::throw_with_nested(mix_cc::Exception(
        -1, "legal range get error, distribution type: student",
        BOOST_CURRENT_LOCATION));
  }
  // Unknown / unsupported distribution type.
  return mix_cc::float_range_t{0, 0};
}
}  // namespace DAA