// eis/eqpalg/feature_extraction/distribution.cc

#include <eqpalg/feature_extraction/distribution.h>
#include <mix_cc/exception.h>

namespace DAA {

// Make boost::math names (e.g. the `quantile` free function used below)
// usable without qualification inside namespace DAA in this file.
using namespace boost::math;

// Default constructor: creates the logger tagged with this class' name.
Dist::Dist() {
  this->logger_ = std::make_unique<LOG>("DAA::Dist");
}

// Nothing to release by hand; members clean up after themselves.
Dist::~Dist() = default;

// Distance between the quartiles of the sample and the quartiles of a
// candidate distribution model.
//
// Arguments come in (sample, model) pairs, one pair per quartile:
//   (p11, p12) -> Q1, (p21, p22) -> Q2, (p31, p32) -> Q3.
//
// Returns the Euclidean (L2) distance
//   sqrt((Q1-q1)^2 + (Q2-q2)^2 + (Q3-q3)^2)
// (not the sum of absolute differences).
double get_quartile_distance(double p11, double p12, double p21, double p22,
                             double p31, double p32) {
  const double d1 = p11 - p12;
  const double d2 = p21 - p22;
  const double d3 = p31 - p32;
  // Plain multiplication instead of pow(x, 2): exact and cheaper.
  return sqrt(d1 * d1 + d2 * d2 + d3 * d3);
}

// Quartile distance between the sample and a Student's t model.
//
// The model quartiles are taken as
//   q1 = mean - w, q2 = mean, q3 = mean + w,
// with w = t(0.75) * stddev / sqrt(df + 1), where t(0.75) is the 0.75
// quantile of `dist`.
//
// dist       Student's t distribution to compare against
// mean       sample mean
// stddev     sample standard deviation
// df         degrees of freedom used to build `dist`
// p1,p2,p3   sample quartiles Q1, Q2, Q3
//
// Returns the Euclidean distance between (p1, p2, p3) and (q1, q2, q3).
double get_st_quartile_distance(boost::math::students_t dist, double mean,
                                double stddev, double df, double p1, double p2,
                                double p3) {
  const double t_upper = quantile(dist, 0.75);
  const double half_width = t_upper * stddev / sqrt(df + 1);
  const double d1 = (mean - half_width) - p1;
  const double d2 = mean - p2;
  // The upper model quartile must be compared with the sample's Q3 (p3);
  // the previous code compared it with p1 and never used p3.
  const double d3 = (mean + half_width) - p3;
  return sqrt(d1 * d1 + d2 * d2 + d3 * d3);
}

// Returns the distribution type currently stored in dist_type_.
Dist::DistTypes Dist::get_distribution_type() const {
  return dist_type_;
}

int Dist::set_predefined_prob(double prob) {
  this->predefied_prob_ = prob;
  return 0;
}

double Dist::get_shifted_prob() const { return prob_; }

/**
 * @brief Validate the sample, pick the best-fitting distribution model and
 *        derive the legal (confidence) range.
 *
 * Steps:
 *  1. Reject the sample (valid_ = false, type = unknown) when it has fewer
 *     than 20 points, min and max are both 0, the standard deviation is 0, or
 *     stddev / |mean| exceeds 3.
 *  2. If verification data is supplied and the skewness is above 0.1, compare
 *     the data quartiles with the quartiles of a normal, a Student's t and a
 *     skew-normal model and keep the closest one; otherwise assume normal.
 *  3. Starting from the predefined confidence level, raise it in steps of
 *     0.01 (while it is below 0.99) until the type-I error rate measured on
 *     the verification data is at most 0.3, then store the resulting range.
 *
 * @param  rs        running statistics of the sample
 * @param  tmp_data  verification data; MUST be sorted in ascending order
 *                   (may be empty)
 * @return int       0 on success, -1 if the sample is invalid or an
 *                   exception was caught
 */
int Dist::auto_test(dlib::running_stats<double> rs,
                    const std::vector<double>& tmp_data) {
  try {
    this->rs_ = rs;

    // Check the sample count first so the other statistics are only queried
    // when enough points were accumulated.
    bool invalid = this->rs_.current_n() < 20;
    if (!invalid) {
      const double mean = this->rs_.mean();
      // Use |mean|: with a negative mean the ratio stddev / mean is negative
      // and the "spread too large" check would never trigger.
      const double abs_mean = mean < 0 ? -mean : mean;
      invalid = (this->rs_.min() == 0 && this->rs_.max() == 0) ||
                this->rs_.stddev() == 0 ||
                this->rs_.stddev() / abs_mean > 3;
    }
    if (invalid) {
      this->valid_ = false;
      this->dist_type_ = DistTypes::unknown;
      logger_->Error()
          << "样本无效！数据重要特征为0，或者数据量太小，数据标准差过大！"
          << "\n min:" << this->rs_.min() << ",max:" << this->rs_.max()
          << ",stddev:" << this->rs_.stddev()
          << ",current:" << this->rs_.current_n()
          << ",mean:" << this->rs_.mean() << endl;
      return -1;
    }
    this->valid_ = true;

    // The normal model is needed on every remaining path.
    normal_ = std::make_shared<boost::math::normal>(rs_.mean(), rs_.stddev());

    if (tmp_data.empty()) {
      // No verification data: provisionally assume a normal distribution.
      this->dist_type_ = DistTypes::normal;
      logger_->Info() << "验证数据为空，数据默认判断正态分布！" << endl;
    } else if (rs_.skewness() > 0.1) {
      // NOTE(review): only positive skewness is tested here; negatively
      // skewed data is treated as normal — confirm this is intended.

      // Sample quartiles, read directly from the sorted verification data.
      const auto tmp_data_size = tmp_data.size();
      const double p1 = tmp_data[tmp_data_size * 1 / 4];
      const double p2 = tmp_data[tmp_data_size * 2 / 4];
      const double p3 = tmp_data[tmp_data_size * 3 / 4];

      // Degrees of freedom for the t model.
      // NOTE(review): the skewness is passed as the first argument
      // (difference from mean) — confirm against the Boost documentation.
      const double df = boost::math::students_t::find_degrees_of_freedom(
          rs_.skewness(), 0.05, 0.05, rs_.stddev());
      students_t_ = std::make_shared<boost::math::students_t>(df);
      skew_normal_ = std::make_shared<boost::math::skew_normal>(
          rs_.mean(), rs_.stddev(), rs_.skewness());

      const double distance_student = get_st_quartile_distance(
          *students_t_, rs_.mean(), rs_.stddev(), df, p1, p2, p3);

      const double distance_skew_normal = get_quartile_distance(
          p1, quantile(*skew_normal_, 0.25), p2, quantile(*skew_normal_, 0.5),
          p3, quantile(*skew_normal_, 0.75));

      const double distance_normal = get_quartile_distance(
          p1, quantile(*normal_, 0.25), p2, quantile(*normal_, 0.5), p3,
          quantile(*normal_, 0.75));

      // Keep the model whose quartiles are closest to the data; ties are
      // resolved in the order normal, Student's t, skew-normal.
      if (distance_normal <= distance_student &&
          distance_normal <= distance_skew_normal) {
        this->dist_type_ = DistTypes::normal;
        logger_->Info() << "数据判定为正态分布！" << endl;
      } else if (distance_student <= distance_normal &&
                 distance_student <= distance_skew_normal) {
        this->dist_type_ = DistTypes::student_t;
        logger_->Info() << "数据判定为T分布！" << endl;
      } else if (distance_skew_normal <= distance_normal &&
                 distance_skew_normal <= distance_student) {
        this->dist_type_ = DistTypes::skew_normal;
        logger_->Info() << "数据判定为偏态分布！" << endl;
      } else {
        // Only reachable when a distance is NaN (every comparison is false).
        // Fall back to normal instead of keeping a stale type from an
        // earlier call.
        this->dist_type_ = DistTypes::normal;
        logger_->Info() << "四分位距离无法比较，默认判定为正态分布！" << endl;
      }
    } else {
      logger_->Info() << "偏态系数小于等于0.1，数据判定为正态分布！" << endl;
      this->dist_type_ = DistTypes::normal;
    }

    this->prob_ = predefied_prob_;

    // Widen the confidence level step by step until the verification data
    // produces an acceptable type-I error rate (or the cap is reached).
    auto error_rate = get_error_rate_type_1(prob_, tmp_data);
    logger_->Debug() << "预设置信度：" << prob_ << ",首次第一类错误率："
                     << error_rate << endl;
    while (error_rate > 0.3 && prob_ < 0.99) {
      prob_ += 0.01;
      error_rate = get_error_rate_type_1(prob_, tmp_data);
    }

    // Final legal range for the chosen confidence level.
    this->legal_range_ = get_range(prob_);
    logger_->Debug() << "最终置信度：" << prob_ << ",第一类错误率："
                     << error_rate << ",置信区间：["
                     << this->legal_range_.get_left() << ","
                     << this->legal_range_.get_right() << "]" << endl;
  } catch (const std::exception& e) {
    logger_->Error() << "Dist::auto_test 异常！" << e.what()
                     << ",location:" << BOOST_CURRENT_LOCATION << endl;
    this->valid_ = false;
    return -1;
  }
  return 0;
}

// 设置报警数据信息，如果存在确切的报警数据，该数据可以帮助我们更好地判断总体的数据类型
// int Dist::set_warning_sample_optional(
//     const std::vector<double>& warning_sample) {
//   this->warning_sample_ = warning_sample_;
//   return 0;
// }

// Type-I error rate: the fraction of (correct) verification points that the
// range for confidence level `prob` would flag as out of bounds.
//
// Counts the leading run of values lying more than 0.1 below the left bound
// and the trailing run lying more than 0.1 above the right bound; each scan
// stops at the first value inside the tolerance, so `tmp_data` is expected to
// be sorted in ascending order. Returns 0 for empty input, otherwise
// (left count + right count) / size.
double Dist::get_error_rate_type_1(const double& prob,
                                   const std::vector<double>& tmp_data) {
  const size_t total = tmp_data.size();
  if (total == 0) {
    return 0;
  }

  auto bounds = this->get_range(prob);

  // Scan from the front while the value is still below the tolerance band.
  size_t below = 0;
  while (below < total && !(bounds.get_left() <= tmp_data[below] + 0.1)) {
    ++below;
  }

  // Scan from the back while the value is still above the tolerance band.
  size_t above = 0;
  while (above < total &&
         !(bounds.get_right() >= tmp_data[total - 1 - above] - 0.1)) {
    ++above;
  }

  logger_->Debug() << "lt_left:" << below << ",gt_right:" << above
                   << ",tmp_data.size():" << total << endl;

  // Ratio of flagged points to all points.
  return static_cast<double>(below + above) / static_cast<double>(total);
}

bool Dist::valid() const { return valid_; }

// Returns the stored legal range (legal_range_) without recomputing it.
mix_cc::float_range_t Dist::get_range() const {
  return legal_range_;
}

/**
 * @brief Compute the two-sided range for a confidence level.
 *
 * The excluded probability mass (1 - prob) is split evenly between both
 * tails, so the bounds are taken at (1 - prob) / 2 and (1 + prob) / 2 for
 * every distribution type.
 *
 * @param  prob  confidence level
 * @return mix_cc::float_range_t  [lower, upper] for the current distribution
 *         type; {0, 0} when the type is none of normal / skew-normal /
 *         Student's t, or when the matching model has not been created yet
 * @throws mix_cc::Exception (nesting the original exception) when a quantile
 *         computation fails
 */
mix_cc::float_range_t Dist::get_range(double prob) {
  const double upper_prob = prob / 2 + 0.5;
  const double lower_prob = 1 - upper_prob;

  // Each branch also checks that its model exists, so a call made before the
  // model was built falls through to {0, 0} instead of dereferencing null.
  if (this->dist_type_ == DistTypes::normal && normal_) {
    try {
      return mix_cc::float_range_t{quantile(*normal_, lower_prob),
                                   quantile(*normal_, upper_prob)};
    } catch (const std::exception&) {
      std::throw_with_nested(mix_cc::Exception(
          -1, "legal range get error, distribution type: normal",
          BOOST_CURRENT_LOCATION));
    }
  }

  if (this->dist_type_ == DistTypes::skew_normal && skew_normal_) {
    try {
      // Upper bound uses the same tail probability as the normal branch
      // (previously `prob`, which made the interval narrower than requested).
      return mix_cc::float_range_t{quantile(*skew_normal_, lower_prob),
                                   quantile(*skew_normal_, upper_prob)};
    } catch (const std::exception&) {
      std::throw_with_nested(mix_cc::Exception(
          -1, "legal range get error, distribution type: skew-normal",
          BOOST_CURRENT_LOCATION));
    }
  }

  if (this->dist_type_ == DistTypes::student_t && students_t_) {
    try {
      // Symmetric interval around the mean, using the two-sided tail
      // probability (previously `prob`, i.e. a one-sided quantile).
      // NOTE(review): the t quantile is not scaled by the sample's standard
      // deviation here — confirm whether a scale factor is missing.
      const double half_width = quantile(*students_t_, upper_prob);
      return mix_cc::float_range_t{rs_.mean() - half_width,
                                   rs_.mean() + half_width};
    } catch (const std::exception&) {
      std::throw_with_nested(mix_cc::Exception(
          -1, "legal range get error, distribution type: student",
          BOOST_CURRENT_LOCATION));
    }
  }

  return mix_cc::float_range_t{0, 0};
}

}  // namespace DAA