233 lines
8.7 KiB
C++
233 lines
8.7 KiB
C++
|
|
#include <eqpalg/feature_extraction/distribution.h>

#include <cmath>

#include <mix_cc/exception.h>
|
|||
|
|
|
|||
|
|
namespace DAA {
|
|||
|
|
|
|||
|
|
using namespace boost::math;
|
|||
|
|
|
|||
|
|
// Creates the logger used by this instance, tagged "DAA::Dist".
Dist::Dist() {
  this->logger_ = std::make_unique<LOG>("DAA::Dist");
}
|
|||
|
|
|
|||
|
|
// Nothing to release by hand; members clean up after themselves.
Dist::~Dist() = default;
|
|||
|
|
|
|||
|
|
/**
 * @brief Distance between two sets of quartiles.
 *
 * The arguments come in three pairs, one pair per quartile. Each pair holds
 * the two values being compared (in auto_test: the sample quartile and the
 * corresponding quartile of a candidate distribution model).
 *
 * The result is the Euclidean distance
 *   sqrt((p11 - p12)^2 + (p21 - p22)^2 + (p31 - p32)^2),
 * not the sum of absolute differences.
 *
 * @param p11,p12 first-quartile pair
 * @param p21,p22 second-quartile (median) pair
 * @param p31,p32 third-quartile pair
 * @return non-negative Euclidean distance between the two quartile triples
 */
double get_quartile_distance(double p11, double p12, double p21, double p22,
                             double p31, double p32) {
  const double d1 = p11 - p12;
  const double d2 = p21 - p22;
  const double d3 = p31 - p32;
  // Plain multiplication instead of pow(x, 2): cheaper for a fixed square.
  return std::sqrt(d1 * d1 + d2 * d2 + d3 * d3);
}
|
|||
|
|
|
|||
|
|
// 得到T分布的四分点的距离信息
|
|||
|
|
double get_st_quartile_distance(boost::math::students_t dist, double mean,
|
|||
|
|
double stddev, double df, double p1, double p2,
|
|||
|
|
double p3) {
|
|||
|
|
double t1 = quantile(dist, 0.75);
|
|||
|
|
double w1 = t1 * stddev / sqrt(df + 1);
|
|||
|
|
return sqrt(pow(mean - w1 - p1, 2) + pow(mean - p2, 2) +
|
|||
|
|
pow(mean + w1 - p1, 2));
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Returns the distribution type currently stored in dist_type_.
Dist::DistTypes Dist::get_distribution_type() const {
  return dist_type_;
}
|
|||
|
|
|
|||
|
|
int Dist::set_predefined_prob(double prob) {
|
|||
|
|
this->predefied_prob_ = prob;
|
|||
|
|
return 0;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
double Dist::get_shifted_prob() const { return prob_; }
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
* @brief 检测分布
|
|||
|
|
* @param rs 统计特征值
|
|||
|
|
* @param tmp_data !从小到大排列的分布数据
|
|||
|
|
* @return int
|
|||
|
|
*/
|
|||
|
|
int Dist::auto_test(dlib::running_stats<double> rs,
|
|||
|
|
const std::vector<double>& tmp_data) {
|
|||
|
|
try {
|
|||
|
|
this->rs_ = rs;
|
|||
|
|
std::tuple<double, std::string, int, std::vector<int>> test;
|
|||
|
|
// 如果数据重要特征为0或者数据量太小,则认为是无效数据
|
|||
|
|
// 如果数据标准差过大,则也认为是无效数据
|
|||
|
|
if ((this->rs_.min() == 0 && this->rs_.max() == 0) ||
|
|||
|
|
(this->rs_.stddev() == 0 || this->rs_.current_n() < 20) ||
|
|||
|
|
(rs_.stddev() / rs_.mean() > 3)) {
|
|||
|
|
this->valid_ = false;
|
|||
|
|
this->dist_type_ = DistTypes::unknown;
|
|||
|
|
logger_->Error()
|
|||
|
|
<< "样本无效!数据重要特征为0,或者数据量太小,数据标准差过大!"
|
|||
|
|
<< "/n min:" << this->rs_.min() << ",max:" << this->rs_.max()
|
|||
|
|
<< ",stddev:" << this->rs_.stddev()
|
|||
|
|
<< ",current:" << this->rs_.current_n()
|
|||
|
|
<< ",mean:" << this->rs_.mean() << endl;
|
|||
|
|
return -1;
|
|||
|
|
} else {
|
|||
|
|
this->valid_ = true;
|
|||
|
|
}
|
|||
|
|
// 如果验证数据非空,则对验证数据进行测试,找到最相似的分布
|
|||
|
|
//无效数据也不进行计算
|
|||
|
|
if (!tmp_data.empty() && this->valid_) {
|
|||
|
|
double p1, p2, p3;
|
|||
|
|
auto tmp_data_size = tmp_data.size();
|
|||
|
|
// 得到四分点的数据
|
|||
|
|
p1 = tmp_data[tmp_data_size * 1 / 4];
|
|||
|
|
p2 = tmp_data[tmp_data_size * 2 / 4];
|
|||
|
|
p3 = tmp_data[tmp_data_size * 3 / 4];
|
|||
|
|
normal_ = std::make_shared<boost::math::normal>(rs_.mean(), rs_.stddev());
|
|||
|
|
if (rs_.skewness() > 0.1) {
|
|||
|
|
// 计算自由的
|
|||
|
|
double df = boost::math::students_t::find_degrees_of_freedom(
|
|||
|
|
rs_.skewness(), 0.05, 0.05, rs_.stddev());
|
|||
|
|
// 根据自由的构造 t分布
|
|||
|
|
students_t_ = std::make_shared<boost::math::students_t>(df);
|
|||
|
|
// 构造偏态分布
|
|||
|
|
skew_normal_ = std::make_shared<boost::math::skew_normal>(
|
|||
|
|
rs_.mean(), rs_.stddev(), rs_.skewness());
|
|||
|
|
|
|||
|
|
double distance_student = get_st_quartile_distance(
|
|||
|
|
*students_t_, rs_.mean(), rs_.stddev(), df, p1, p2, p3);
|
|||
|
|
|
|||
|
|
double distance_skew_normal = get_quartile_distance(
|
|||
|
|
p1, quantile(*skew_normal_, 0.25), p2, quantile(*skew_normal_, 0.5),
|
|||
|
|
p3, quantile(*skew_normal_, 0.75));
|
|||
|
|
|
|||
|
|
double distance_normal = get_quartile_distance(
|
|||
|
|
p1, quantile(*normal_, 0.25), p2, quantile(*normal_, 0.5), p3,
|
|||
|
|
quantile(*normal_, 0.75));
|
|||
|
|
// 用给定假设分布的数据
|
|||
|
|
if (distance_normal <= distance_student &&
|
|||
|
|
distance_normal <= distance_skew_normal) {
|
|||
|
|
this->dist_type_ = DistTypes::normal;
|
|||
|
|
logger_->Info() << "数据判定为正态分布!" << endl;
|
|||
|
|
} else if (distance_student <= distance_normal &&
|
|||
|
|
distance_student <= distance_skew_normal) {
|
|||
|
|
this->dist_type_ = DistTypes::student_t;
|
|||
|
|
logger_->Info() << "数据判定为T分布!" << endl;
|
|||
|
|
} else if (distance_skew_normal <= distance_normal &&
|
|||
|
|
distance_skew_normal <= distance_student) {
|
|||
|
|
this->dist_type_ = DistTypes::skew_normal;
|
|||
|
|
logger_->Info() << "数据判定为偏态分布!" << endl;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
} else {
|
|||
|
|
logger_->Info() << "偏态系数小于等于0.1,数据判定为正态分布!" << endl;
|
|||
|
|
this->dist_type_ = DistTypes::normal;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
} else {
|
|||
|
|
// 如果验证数据为空,则暂时认为是正态分布
|
|||
|
|
this->dist_type_ = DistTypes::normal;
|
|||
|
|
normal_ = std::make_shared<boost::math::normal>(rs_.mean(), rs_.stddev());
|
|||
|
|
logger_->Info() << "验证数据为空,数据默认判断正态分布!" << endl;
|
|||
|
|
}
|
|||
|
|
this->prob_ = predefied_prob_;
|
|||
|
|
|
|||
|
|
// 根据预设置信区间和输入数据,对输入数据进行验证,动态扩大置信区间
|
|||
|
|
auto error_rate = get_error_rate_type_1(prob_, tmp_data);
|
|||
|
|
logger_->Debug() << "预设置信度:" << prob_ << ",首次第一类错误率:"
|
|||
|
|
<< error_rate << endl;
|
|||
|
|
while (error_rate > 0.3) {
|
|||
|
|
if (prob_ < 0.99) {
|
|||
|
|
prob_ += 0.01;
|
|||
|
|
error_rate = get_error_rate_type_1(prob_, tmp_data);
|
|||
|
|
} else {
|
|||
|
|
break;
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
// 得到置信范围
|
|||
|
|
this->legal_range_ = get_range(prob_);
|
|||
|
|
logger_->Debug() << "最终置信度:" << prob_ << ",第一类错误率:"
|
|||
|
|
<< error_rate << ",置信区间:["
|
|||
|
|
<< this->legal_range_.get_left() << ","
|
|||
|
|
<< this->legal_range_.get_right() << "]" << endl;
|
|||
|
|
} catch (const std::exception& e) {
|
|||
|
|
logger_->Error() << "Dist::auto_test 异常!" << e.what()
|
|||
|
|
<< ",location:" << BOOST_CURRENT_LOCATION << endl;
|
|||
|
|
this->valid_ = false;
|
|||
|
|
return -1;
|
|||
|
|
}
|
|||
|
|
return 0;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// 设置报警数据信息,如果存在确切的报警数据,该数据可以帮助我们更好地判断总体的数据类型
|
|||
|
|
// int Dist::set_warning_sample_optional(
|
|||
|
|
// const std::vector<double>& warning_sample) {
|
|||
|
|
// this->warning_sample_ = warning_sample_;
|
|||
|
|
// return 0;
|
|||
|
|
// }
|
|||
|
|
|
|||
|
|
// 得到第一类错误的错误率(如果数据正确而被判断出错)
|
|||
|
|
double Dist::get_error_rate_type_1(const double& prob,
|
|||
|
|
const std::vector<double>& tmp_data) {
|
|||
|
|
if (tmp_data.empty()) {
|
|||
|
|
return 0;
|
|||
|
|
}
|
|||
|
|
auto range = this->get_range(prob);
|
|||
|
|
size_t lt_left = 0;
|
|||
|
|
size_t gt_right = 0;
|
|||
|
|
for (auto x : tmp_data) {
|
|||
|
|
if (range.get_left() <= x + 0.1) {
|
|||
|
|
break;
|
|||
|
|
}
|
|||
|
|
lt_left++;
|
|||
|
|
}
|
|||
|
|
for (auto r_it = tmp_data.rbegin(); r_it != tmp_data.rend(); r_it++) {
|
|||
|
|
if (range.get_right() >= (*r_it) - 0.1) {
|
|||
|
|
break;
|
|||
|
|
}
|
|||
|
|
gt_right++;
|
|||
|
|
}
|
|||
|
|
logger_->Debug() << "lt_left:" << lt_left << ",gt_right:" << gt_right
|
|||
|
|
<< ",tmp_data.size():" << tmp_data.size() << endl;
|
|||
|
|
return double((lt_left + gt_right)) /
|
|||
|
|
double(tmp_data.size()); ///<小于1的小数
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
bool Dist::valid() const { return valid_; }
|
|||
|
|
|
|||
|
|
// Returns a copy of legal_range_, the range stored by the last auto_test run.
mix_cc::float_range_t Dist::get_range() const {
  return legal_range_;
}
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
* @brief 计算置信区间
|
|||
|
|
* @param prob 置信度
|
|||
|
|
* @return mix_cc::float_range_t
|
|||
|
|
*/
|
|||
|
|
mix_cc::float_range_t Dist::get_range(double prob) {
|
|||
|
|
double dest_prob = prob / 2 + 0.5;
|
|||
|
|
try {
|
|||
|
|
if (this->dist_type_ == DistTypes::normal) {
|
|||
|
|
return mix_cc::float_range_t{quantile(*normal_, 1 - dest_prob),
|
|||
|
|
quantile(*normal_, dest_prob)};
|
|||
|
|
}
|
|||
|
|
} catch (const std::exception& e) {
|
|||
|
|
std::throw_with_nested(mix_cc::Exception(
|
|||
|
|
-1, "legal range get error, distribution type: normal",
|
|||
|
|
BOOST_CURRENT_LOCATION));
|
|||
|
|
}
|
|||
|
|
try {
|
|||
|
|
if (this->dist_type_ == DistTypes::skew_normal) {
|
|||
|
|
return mix_cc::float_range_t{
|
|||
|
|
quantile(*skew_normal_, 1 - dest_prob),
|
|||
|
|
quantile(*skew_normal_, prob)}; // 2021-10-28 偏态 正态分布
|
|||
|
|
}
|
|||
|
|
} catch (const std::exception& e) {
|
|||
|
|
std::throw_with_nested(mix_cc::Exception(
|
|||
|
|
-1, "legal range get error, distribution type: skew-normal",
|
|||
|
|
BOOST_CURRENT_LOCATION));
|
|||
|
|
}
|
|||
|
|
try {
|
|||
|
|
if (this->dist_type_ == DistTypes::student_t) {
|
|||
|
|
return mix_cc::float_range_t{rs_.mean() - quantile(*students_t_, prob),
|
|||
|
|
rs_.mean() + quantile(*students_t_, prob)};
|
|||
|
|
}
|
|||
|
|
} catch (const std::exception& e) {
|
|||
|
|
std::throw_with_nested(mix_cc::Exception(
|
|||
|
|
-1, "legal range get error, distribution type: student",
|
|||
|
|
BOOST_CURRENT_LOCATION));
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
return mix_cc::float_range_t{0, 0};
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
} // namespace DAA
|