226 lines
8.3 KiB
C++
226 lines
8.3 KiB
C++
#include <eqpalg/feature_extraction/distribution.h>
|
||
#include <mix_cc/exception.h>
|
||
|
||
namespace DAA {
|
||
|
||
using namespace boost::math;
|
||
|
||
/// Default constructor: creates the logger for this object under the name
/// "DAA::Dist".
Dist::Dist() {
  this->logger_ = std::make_unique<LOG>("DAA::Dist");
}
|
||
|
||
/// Destructor: no explicit work is done here.
Dist::~Dist() = default;
|
||
|
||
// Distance between two sets of quartiles, e.g. the quartiles Q of the sample
// data and the quartiles q of a candidate distribution model.
//
// The arguments form three pairs: (p11, p12), (p21, p22) and (p31, p32).
// The result is the Euclidean norm of the three pairwise differences:
//   dist = sqrt((p11 - p12)^2 + (p21 - p22)^2 + (p31 - p32)^2)
double get_quartile_distance(double p11, double p12, double p21, double p22,
                             double p31, double p32) {
  const double gaps[] = {p11 - p12, p21 - p22, p31 - p32};

  double squared_sum = 0.0;
  for (const double gap : gaps) {
    squared_sum += std::pow(gap, 2);
  }
  return std::sqrt(squared_sum);
}
|
||
|
||
// 得到T分布的四分点的距离信息
|
||
double get_st_quartile_distance(boost::math::students_t dist, double mean,
|
||
double stddev, double df, double p1, double p2,
|
||
double p3) {
|
||
double t1 = quantile(dist, 0.75);
|
||
double w1 = t1 * stddev / sqrt(df + 1);
|
||
return sqrt(pow(mean - w1 - p1, 2) + pow(mean - p2, 2) +
|
||
pow(mean + w1 - p1, 2));
|
||
}
|
||
|
||
/// Returns the distribution type currently stored in dist_type_.
Dist::DistTypes Dist::get_distribution_type() const {
  return dist_type_;
}
|
||
|
||
int Dist::set_predefined_prob(double prob) {
|
||
this->predefied_prob_ = prob;
|
||
return 0;
|
||
}
|
||
|
||
double Dist::get_shifted_prob() const { return prob_; }
|
||
|
||
/**
|
||
* @brief 检测分布
|
||
* @param rs 统计特征值
|
||
* @param tmp_data !从小到大排列的分布数据
|
||
* @return int
|
||
*/
|
||
int Dist::auto_test(dlib::running_stats<double> rs,
|
||
const std::vector<double>& tmp_data) {
|
||
try {
|
||
this->rs_ = rs;
|
||
std::tuple<double, std::string, int, std::vector<int>> test;
|
||
// 如果数据重要特征为0或者数据量太小,则认为是无效数据
|
||
// 如果数据标准差过大,则也认为是无效数据
|
||
if ((this->rs_.min() == 0 && this->rs_.max() == 0) ||
|
||
(this->rs_.stddev() == 0 || this->rs_.current_n() < 20) ||
|
||
(rs_.stddev() / rs_.mean() > 3)) {
|
||
this->valid_ = false;
|
||
this->dist_type_ = DistTypes::unknown;
|
||
logger_->Error()
|
||
<< "样本无效!数据重要特征为0,或者数据量太小,数据标准差过大!"
|
||
<< "/n min:" << this->rs_.min() << ",max:" << this->rs_.max()
|
||
<< ",stddev:" << this->rs_.stddev()
|
||
<< ",current:" << this->rs_.current_n()
|
||
<< ",mean:" << this->rs_.mean() << endl;
|
||
return -1;
|
||
} else {
|
||
this->valid_ = true;
|
||
}
|
||
// 如果验证数据非空,则对验证数据进行测试,找到最相似的分布
|
||
//无效数据也不进行计算
|
||
if (!tmp_data.empty() && this->valid_) {
|
||
double p1, p2, p3;
|
||
auto tmp_data_size = tmp_data.size();
|
||
// 得到四分点的数据
|
||
p1 = tmp_data[tmp_data_size * 1 / 4];
|
||
p2 = tmp_data[tmp_data_size * 2 / 4];
|
||
p3 = tmp_data[tmp_data_size * 3 / 4];
|
||
normal_ = std::make_shared<boost::math::normal>(rs_.mean(), rs_.stddev());
|
||
if (rs_.skewness() > 0.1) {
|
||
// 计算自由的
|
||
double df = boost::math::students_t::find_degrees_of_freedom(
|
||
rs_.skewness(), 0.05, 0.05, rs_.stddev());
|
||
// 根据自由的构造 t分布
|
||
students_t_ = std::make_shared<boost::math::students_t>(df);
|
||
// 构造偏态分布
|
||
skew_normal_ = std::make_shared<boost::math::skew_normal>(
|
||
rs_.mean(), rs_.stddev(), rs_.skewness());
|
||
|
||
double distance_student = get_st_quartile_distance(
|
||
*students_t_, rs_.mean(), rs_.stddev(), df, p1, p2, p3);
|
||
|
||
double distance_skew_normal = get_quartile_distance(
|
||
p1, quantile(*skew_normal_, 0.25), p2, quantile(*skew_normal_, 0.5),
|
||
p3, quantile(*skew_normal_, 0.75));
|
||
|
||
double distance_normal = get_quartile_distance(
|
||
p1, quantile(*normal_, 0.25), p2, quantile(*normal_, 0.5), p3,
|
||
quantile(*normal_, 0.75));
|
||
// 用给定假设分布的数据
|
||
if (distance_normal <= distance_student &&
|
||
distance_normal <= distance_skew_normal) {
|
||
this->dist_type_ = DistTypes::normal;
|
||
logger_->Info() << "数据判定为正态分布!" << endl;
|
||
} else if (distance_student <= distance_normal &&
|
||
distance_student <= distance_skew_normal) {
|
||
this->dist_type_ = DistTypes::student_t;
|
||
logger_->Info() << "数据判定为T分布!" << endl;
|
||
} else if (distance_skew_normal <= distance_normal &&
|
||
distance_skew_normal <= distance_student) {
|
||
this->dist_type_ = DistTypes::skew_normal;
|
||
logger_->Info() << "数据判定为偏态分布!" << endl;
|
||
}
|
||
|
||
} else {
|
||
logger_->Info() << "偏态系数小于等于0.1,数据判定为正态分布!" << endl;
|
||
this->dist_type_ = DistTypes::normal;
|
||
}
|
||
|
||
} else {
|
||
// 如果验证数据为空,则暂时认为是正态分布
|
||
this->dist_type_ = DistTypes::normal;
|
||
normal_ = std::make_shared<boost::math::normal>(rs_.mean(), rs_.stddev());
|
||
logger_->Info() << "验证数据为空,数据默认判断正态分布!" << endl;
|
||
}
|
||
this->prob_ = predefied_prob_;
|
||
|
||
// 根据预设置信区间和输入数据,对输入数据进行验证,动态扩大置信区间
|
||
auto error_rate = get_error_rate_type_1(prob_, tmp_data);
|
||
logger_->Debug() << "预设置信度:" << prob_ << ",首次第一类错误率:"
|
||
<< error_rate << endl;
|
||
while (error_rate > 0.3) {
|
||
if (prob_ < 0.99) {
|
||
prob_ += 0.01;
|
||
error_rate = get_error_rate_type_1(prob_, tmp_data);
|
||
} else {
|
||
break;
|
||
}
|
||
}
|
||
// 得到置信范围
|
||
this->legal_range_ = get_range(prob_);
|
||
logger_->Debug() << "最终置信度:" << prob_ << ",第一类错误率:"
|
||
<< error_rate << ",置信区间:["
|
||
<< this->legal_range_.get_left() << ","
|
||
<< this->legal_range_.get_right() << "]" << endl;
|
||
} catch (const std::exception& e) {
|
||
logger_->Error() << "Dist::auto_test 异常!" << e.what()
|
||
<< ",location:" << BOOST_CURRENT_LOCATION << endl;
|
||
this->valid_ = false;
|
||
return -1;
|
||
}
|
||
return 0;
|
||
}
|
||
|
||
// 得到第一类错误的错误率(如果数据正确而被判断出错)
|
||
double Dist::get_error_rate_type_1(const double& prob,
|
||
const std::vector<double>& tmp_data) {
|
||
if (tmp_data.empty()) {
|
||
return 0;
|
||
}
|
||
auto range = this->get_range(prob);
|
||
size_t lt_left = 0;
|
||
size_t gt_right = 0;
|
||
for (auto x : tmp_data) {
|
||
if (range.get_left() <= x + 0.1) {
|
||
break;
|
||
}
|
||
lt_left++;
|
||
}
|
||
for (auto r_it = tmp_data.rbegin(); r_it != tmp_data.rend(); r_it++) {
|
||
if (range.get_right() >= (*r_it) - 0.1) {
|
||
break;
|
||
}
|
||
gt_right++;
|
||
}
|
||
logger_->Debug() << "lt_left:" << lt_left << ",gt_right:" << gt_right
|
||
<< ",tmp_data.size():" << tmp_data.size() << endl;
|
||
return double((lt_left + gt_right)) /
|
||
double(tmp_data.size());
|
||
}
|
||
|
||
bool Dist::valid() const { return valid_; }
|
||
|
||
/// Returns the stored confidence range (legal_range_) without recomputing it.
mix_cc::float_range_t Dist::get_range() const {
  return legal_range_;
}
|
||
|
||
/**
|
||
* @brief 计算置信区间
|
||
* @param prob 置信度
|
||
* @return mix_cc::float_range_t
|
||
*/
|
||
mix_cc::float_range_t Dist::get_range(double prob) {
|
||
double dest_prob = prob / 2 + 0.5;
|
||
try {
|
||
if (this->dist_type_ == DistTypes::normal) {
|
||
return mix_cc::float_range_t{quantile(*normal_, 1 - dest_prob),
|
||
quantile(*normal_, dest_prob)};
|
||
}
|
||
} catch (const std::exception& e) {
|
||
std::throw_with_nested(mix_cc::Exception(
|
||
-1, "legal range get error, distribution type: normal",
|
||
BOOST_CURRENT_LOCATION));
|
||
}
|
||
try {
|
||
if (this->dist_type_ == DistTypes::skew_normal) {
|
||
return mix_cc::float_range_t{
|
||
quantile(*skew_normal_, 1 - dest_prob),
|
||
quantile(*skew_normal_, prob)}; // 2021-10-28 偏态 正态分布
|
||
}
|
||
} catch (const std::exception& e) {
|
||
std::throw_with_nested(mix_cc::Exception(
|
||
-1, "legal range get error, distribution type: skew-normal",
|
||
BOOST_CURRENT_LOCATION));
|
||
}
|
||
try {
|
||
if (this->dist_type_ == DistTypes::student_t) {
|
||
return mix_cc::float_range_t{rs_.mean() - quantile(*students_t_, prob),
|
||
rs_.mean() + quantile(*students_t_, prob)};
|
||
}
|
||
} catch (const std::exception& e) {
|
||
std::throw_with_nested(mix_cc::Exception(
|
||
-1, "legal range get error, distribution type: student",
|
||
BOOST_CURRENT_LOCATION));
|
||
}
|
||
|
||
return mix_cc::float_range_t{0, 0};
|
||
}
|
||
|
||
}
|