// eis/eqpalg/feature_extraction/distribution.cc

#include <eqpalg/feature_extraction/distribution.h>
#include <mix_cc/exception.h>
namespace DAA {
using namespace boost::math;
/// Default constructor: creates the logger under the name "DAA::Dist".
Dist::Dist() : logger_(std::make_unique<LOG>("DAA::Dist")) {}
/// Destructor: nothing to do beyond destroying the members.
Dist::~Dist() = default;
/**
 * @brief Euclidean distance between two sets of three quartile values.
 *
 * The arguments come in pairs: (p11, p12) are the first quartiles of the two
 * sides being compared, (p21, p22) the second quartiles and (p31, p32) the
 * third quartiles. In this file the first value of each pair is the sample
 * quartile and the second is the quartile of a candidate distribution model.
 *
 * Note that this is the root of the summed squared differences, not the sum
 * of absolute differences.
 *
 * @return sqrt((p11 - p12)^2 + (p21 - p22)^2 + (p31 - p32)^2)
 */
double get_quartile_distance(double p11, double p12, double p21, double p22,
                             double p31, double p32) {
  const double d1 = p11 - p12;
  const double d2 = p21 - p22;
  const double d3 = p31 - p32;
  // Plain multiplication rather than pow(x, 2): exact and cheaper.
  return std::sqrt(d1 * d1 + d2 * d2 + d3 * d3);
}
/**
 * @brief Euclidean distance between three sample quartiles and the quartiles
 *        modelled from a Student's t distribution.
 *
 * The modelled quartiles are (mean - w, mean, mean + w) with
 * w = quantile(dist, 0.75) * stddev / sqrt(df + 1).
 *
 * @param dist   Student's t distribution whose 0.75 quantile is used.
 * @param mean   centre of the modelled quartiles.
 * @param stddev scale applied to the t quantile.
 * @param df     degrees of freedom used in the sqrt(df + 1) divisor.
 * @param p1     sample first quartile.
 * @param p2     sample second quartile (median).
 * @param p3     sample third quartile.
 * @return root of the summed squared quartile differences.
 */
double get_st_quartile_distance(boost::math::students_t dist, double mean,
                                double stddev, double df, double p1, double p2,
                                double p3) {
  const double t_upper = quantile(dist, 0.75);
  const double half_width = t_upper * stddev / std::sqrt(df + 1);
  const double d1 = (mean - half_width) - p1;
  const double d2 = mean - p2;
  // The modelled upper quartile must be compared with the sample's third
  // quartile (p3); comparing it with p1 again ignored p3 entirely.
  const double d3 = (mean + half_width) - p3;
  return std::sqrt(d1 * d1 + d2 * d2 + d3 * d3);
}
/// Returns the distribution type currently stored in dist_type_.
Dist::DistTypes Dist::get_distribution_type() const {
  return dist_type_;
}
int Dist::set_predefined_prob(double prob) {
this->predefied_prob_ = prob;
return 0;
}
double Dist::get_shifted_prob() const { return prob_; }
/**
 * @brief Validates the sample statistics, chooses a distribution model and
 *        derives the legal range.
 *
 * Steps:
 *  1. Copy @p rs into rs_. The sample is rejected (valid_ = false,
 *     dist_type_ = unknown, return -1) when min and max are both 0, when the
 *     standard deviation is 0, when fewer than 20 values were accumulated,
 *     or when stddev / mean exceeds 3.
 *  2. With non-empty @p tmp_data and skewness above 0.1, compare the
 *     quartiles of tmp_data with those of a normal, a Student's t and a
 *     skew-normal model and keep the closest one. In every other case the
 *     normal model is used.
 *  3. Starting from predefied_prob_, raise prob_ by 0.01 while the type-1
 *     error rate measured on tmp_data is above 0.3 and prob_ is below 0.99.
 *  4. Store get_range(prob_) in legal_range_.
 *
 * @param rs       running statistics describing the sample.
 * @param tmp_data verification data, may be empty. Quartiles are read by
 *                 index, so the data is presumably sorted ascending —
 *                 TODO confirm with callers.
 * @return 0 on success; -1 if the sample is rejected or a std::exception is
 *         caught (valid_ is set to false in both cases).
 */
int Dist::auto_test(dlib::running_stats<double> rs,
                    const std::vector<double>& tmp_data) {
  try {
    this->rs_ = rs;
    // Reject samples that are all zero, have no spread, are too small, or
    // whose standard deviation is large relative to the mean.
    // NOTE(review): the stddev / mean ratio is signed, so a negative mean can
    // never trip the "> 3" test — confirm whether |mean| was intended.
    if ((this->rs_.min() == 0 && this->rs_.max() == 0) ||
        (this->rs_.stddev() == 0 || this->rs_.current_n() < 20) ||
        (rs_.stddev() / rs_.mean() > 3)) {
      this->valid_ = false;
      this->dist_type_ = DistTypes::unknown;
      logger_->Error()
          << "样本无效数据重要特征为0或者数据量太小数据标准差过大"
          << "\n min:" << this->rs_.min() << ",max:" << this->rs_.max()
          << ",stddev:" << this->rs_.stddev()
          << ",current:" << this->rs_.current_n()
          << ",mean:" << this->rs_.mean() << endl;
      return -1;
    }
    this->valid_ = true;

    if (!tmp_data.empty()) {
      // Verification data is available: look for the best-matching model.
      normal_ = std::make_shared<boost::math::normal>(rs_.mean(), rs_.stddev());
      if (rs_.skewness() > 0.1) {
        // Quartile points of the verification data, read by index.
        const auto tmp_data_size = tmp_data.size();
        const double p1 = tmp_data[tmp_data_size * 1 / 4];
        const double p2 = tmp_data[tmp_data_size * 2 / 4];
        const double p3 = tmp_data[tmp_data_size * 3 / 4];

        // Degrees of freedom for the Student's t model.
        const double df = boost::math::students_t::find_degrees_of_freedom(
            rs_.skewness(), 0.05, 0.05, rs_.stddev());
        students_t_ = std::make_shared<boost::math::students_t>(df);
        skew_normal_ = std::make_shared<boost::math::skew_normal>(
            rs_.mean(), rs_.stddev(), rs_.skewness());

        const double distance_student = get_st_quartile_distance(
            *students_t_, rs_.mean(), rs_.stddev(), df, p1, p2, p3);
        const double distance_skew_normal = get_quartile_distance(
            p1, quantile(*skew_normal_, 0.25), p2, quantile(*skew_normal_, 0.5),
            p3, quantile(*skew_normal_, 0.75));
        const double distance_normal = get_quartile_distance(
            p1, quantile(*normal_, 0.25), p2, quantile(*normal_, 0.5), p3,
            quantile(*normal_, 0.75));

        // Keep the model with the smallest quartile distance; ties go to
        // normal first, then Student's t.
        if (distance_normal <= distance_student &&
            distance_normal <= distance_skew_normal) {
          this->dist_type_ = DistTypes::normal;
          logger_->Info() << "数据判定为正态分布!" << endl;
        } else if (distance_student <= distance_normal &&
                   distance_student <= distance_skew_normal) {
          this->dist_type_ = DistTypes::student_t;
          logger_->Info() << "数据判定为T分布" << endl;
        } else if (distance_skew_normal <= distance_normal &&
                   distance_skew_normal <= distance_student) {
          this->dist_type_ = DistTypes::skew_normal;
          logger_->Info() << "数据判定为偏态分布!" << endl;
        } else {
          // Only reachable when a distance is NaN (every comparison false).
          // Fall back to normal instead of keeping a stale dist_type_.
          this->dist_type_ = DistTypes::normal;
          logger_->Info() << "分位点距离无法比较,数据默认判断正态分布!" << endl;
        }
      } else {
        logger_->Info() << "偏态系数小于等于0.1,数据判定为正态分布!" << endl;
        this->dist_type_ = DistTypes::normal;
      }
    } else {
      // No verification data: assume a normal distribution for now.
      this->dist_type_ = DistTypes::normal;
      normal_ = std::make_shared<boost::math::normal>(rs_.mean(), rs_.stddev());
      logger_->Info() << "验证数据为空,数据默认判断正态分布!" << endl;
    }

    // Start from the preset confidence and widen it while too many of the
    // verification values fall outside the resulting range.
    this->prob_ = predefied_prob_;
    auto error_rate = get_error_rate_type_1(prob_, tmp_data);
    logger_->Debug() << "预设置信度:" << prob_ << ",首次第一类错误率:"
                     << error_rate << endl;
    while (error_rate > 0.3 && prob_ < 0.99) {
      prob_ += 0.01;
      error_rate = get_error_rate_type_1(prob_, tmp_data);
    }

    // Final legal range for the (possibly raised) confidence.
    this->legal_range_ = get_range(prob_);
    logger_->Debug() << "最终置信度:" << prob_ << ",第一类错误率:"
                     << error_rate << ",置信区间:["
                     << this->legal_range_.get_left() << ","
                     << this->legal_range_.get_right() << "]" << endl;
  } catch (const std::exception& e) {
    logger_->Error() << "Dist::auto_test 异常!" << e.what()
                     << ",location:" << BOOST_CURRENT_LOCATION << endl;
    this->valid_ = false;
    return -1;
  }
  return 0;
}
// 设置报警数据信息,如果存在确切的报警数据,该数据可以帮助我们更好地判断总体的数据类型
// int Dist::set_warning_sample_optional(
// const std::vector<double>& warning_sample) {
// this->warning_sample_ = warning_sample_;
// return 0;
// }
/**
 * @brief Type-1 error rate: the share of @p tmp_data that falls outside the
 *        range computed for @p prob (valid data judged as faulty).
 *
 * Values are scanned from the front while they lie more than 0.1 below the
 * left bound, and from the back while they lie more than 0.1 above the right
 * bound. Each scan stops at the first value that is not outside, so values
 * further inside the vector are never inspected — presumably tmp_data is
 * sorted ascending; TODO confirm with callers.
 *
 * @param prob     confidence level passed on to get_range(prob).
 * @param tmp_data verification data; an empty vector yields 0.
 * @return (values below + values above) / tmp_data.size().
 */
double Dist::get_error_rate_type_1(const double& prob,
                                   const std::vector<double>& tmp_data) {
  if (tmp_data.empty()) {
    return 0;
  }

  auto range = this->get_range(prob);
  const auto lower_limit = range.get_left();
  const auto upper_limit = range.get_right();

  // Leading values that are below the left bound by more than 0.1.
  size_t below_count = 0;
  auto front_it = tmp_data.begin();
  while (front_it != tmp_data.end() && !(lower_limit <= *front_it + 0.1)) {
    ++below_count;
    ++front_it;
  }

  // Trailing values that are above the right bound by more than 0.1.
  size_t above_count = 0;
  auto back_it = tmp_data.rbegin();
  while (back_it != tmp_data.rend() && !(upper_limit >= *back_it - 0.1)) {
    ++above_count;
    ++back_it;
  }

  logger_->Debug() << "lt_left:" << below_count << ",gt_right:" << above_count
                   << ",tmp_data.size():" << tmp_data.size() << endl;

  const auto outside_total = static_cast<double>(below_count + above_count);
  return outside_total / static_cast<double>(tmp_data.size());
}
bool Dist::valid() const { return valid_; }
/// Returns the stored legal range (legal_range_) without recomputing it.
mix_cc::float_range_t Dist::get_range() const {
  return legal_range_;
}
/**
 * @brief Computes the value range for confidence level @p prob under the
 *        currently selected distribution type.
 *
 * - normal:      [q(1 - u), q(u)] of normal_, where u = prob / 2 + 0.5.
 * - skew_normal: [q(1 - u), q(prob)] of skew_normal_.
 * - student_t:   rs_.mean() -/+ q(prob) of students_t_.
 * - any other type: {0, 0}.
 *
 * NOTE(review): the skew-normal upper bound uses prob rather than u, which
 * differs from the normal branch; the original carried a dated remark
 * (2021-10-28) at that spot, so this may be deliberate — confirm.
 * NOTE(review): the Student's t branch offsets the mean by the unscaled t
 * quantile at prob (no stddev factor) — confirm that this is intended.
 *
 * @param prob confidence level.
 * @return the computed range, or {0, 0} when the type matches no branch.
 * @throws mix_cc::Exception (nesting the original error) if a quantile
 *         computation throws a std::exception.
 */
mix_cc::float_range_t Dist::get_range(double prob) {
  const double upper_tail = prob / 2 + 0.5;
  const double lower_tail = 1 - upper_tail;
  const auto type = this->dist_type_;

  if (type == DistTypes::normal) {
    try {
      return mix_cc::float_range_t{quantile(*normal_, lower_tail),
                                   quantile(*normal_, upper_tail)};
    } catch (const std::exception&) {
      std::throw_with_nested(mix_cc::Exception(
          -1, "legal range get error, distribution type: normal",
          BOOST_CURRENT_LOCATION));
    }
  }

  if (type == DistTypes::skew_normal) {
    try {
      return mix_cc::float_range_t{quantile(*skew_normal_, lower_tail),
                                   quantile(*skew_normal_, prob)};
    } catch (const std::exception&) {
      std::throw_with_nested(mix_cc::Exception(
          -1, "legal range get error, distribution type: skew-normal",
          BOOST_CURRENT_LOCATION));
    }
  }

  if (type == DistTypes::student_t) {
    try {
      return mix_cc::float_range_t{rs_.mean() - quantile(*students_t_, prob),
                                   rs_.mean() + quantile(*students_t_, prob)};
    } catch (const std::exception&) {
      std::throw_with_nested(mix_cc::Exception(
          -1, "legal range get error, distribution type: student",
          BOOST_CURRENT_LOCATION));
    }
  }

  // No model matches the current type (e.g. unknown): empty range.
  return mix_cc::float_range_t{0, 0};
}
} // namespace DAA