// File: eis/eqpalg/.do_not_use/data_handler/approximate_data.h
// (161 lines, 4.6 KiB, C++)
//
// Repository viewer notice: this file contains Unicode characters that might
// be confused with other characters. If this is intentional, the warning can
// be safely ignored; use the viewer's Escape button to reveal them.


#pragma once
/**
* @file approximate_data.h
* @brief 存储大概分布的类
* @author Cat (null.null.null@qq.com)
* @version 0.1
* @date 2021-08-24
*
* Copyright: Baosight Co. Ltd.
* DO NOT COPY/USE WITHOUT PERMISSION
*
*/
#include <eqpalg/table_struct/t_rule_sample_1d.h>
#include <eqpalg/table_struct/t_rule_sample_2d.h>
#include <eqpalg/table_struct/t_rule_sample_3d.h>
#include <eqpalg/table_struct/t_rule_sample_1d_info.h>
#include <eqpalg/table_struct/t_rule_sample_2d_info.h>
#include <eqpalg/table_struct/t_rule_sample_3d_info.h>
#include <array>
#include <vector>
#include <type_traits>
#include <string>
#include <map>
#include <utility>
#include <random>
#include "mix_cc/type/range.h"
#include "mix_cc/sql.h"
#include "mix_cc/sql/database/db2_t.h"
#include <eqpalg/data_handler/base.h>
#include <eqpalg/gb_logger.h>
#include <eqpalg/define/dlib.h>
#include <eqpalg/define/sample.h>
namespace data_handler {
namespace policy {
using std::string;
using namespace mix_cc;
using namespace mix_cc::sql;
/**
* @brief 大约数据分布
*/
struct ApproximateData : public Base {
/**
* @brief 压缩后的单个数据点(不包含个数信息)
* @tparam n
*/
struct SamplePointWR {
small_vector<mix_cc::float_range_t, 3> value;
SamplePointWR(const SamplePoint& i_value, const SamplePoint& cell_range) {
for (size_t i = 0; i < i_value.size(); i++) {
value.push_back(mix_cc::make_range_t<double>(
(static_cast<int>(i_value[i] / cell_range[i])) * cell_range[i],
cell_range[i])); // range -> left = i_value-cell_range
// right= i_value+cell_range
}
}
/**
* @brief 对数据点进行比较用于map的使用
* @param rhs 比较值
* @return true 被比较值的所有维度数据大小均小于比较值
* @return false 被比较值存在维度数据大于等于比较值
*/
bool operator<(const SamplePointWR& rhs) const {
bool result = true;
for (size_t i = 0; i < this->value.size(); i++) {
result = result && (this->value[i] < rhs.value[i]);
}
return result;
}
};
using Data = std::map<SamplePointWR, size_t>; ///< 压缩之后的数据类型
using Dim1Table = T_RULE_SAMPLE_1D;
using Dim2Table = T_RULE_SAMPLE_2D;
using Dim3Table = T_RULE_SAMPLE_3D;
using InData = SampleWindow;
using DumpedMetaData = std::pair<SamplePoint, size_t>; ///< 导出的元数据类型
using DumpedData = std::vector<DumpedMetaData>; ///< 导出的数据类型
using OutData = InData;
protected:
Data data_; ///< 分布信息数据
Data insert_list_; ///< 要插入的数据
Data update_list_; ///< 要更新的数据
Rs rs_; ///< 不同维度特征值信息数据
small_vector<double, 3>
c_r_; ///< 不同维度的数据单元范围大小(同一维度单元大小一致)
double scale_; ///< 数据缩放大小
size_t dump_size_; ///< 解压缩之后的数据量大小
bool is_first_sampling_ = true;
static constexpr size_t k_dest_dump_size =
10000; ///< 目标的解压缩之后的数据量大小
const std::unique_ptr<GbLogger> gb_logger_;
public:
ApproximateData(const std::string& ruleId, size_t dims);
int first_sampling_batch(const InData& first_runing_info, TimePoint tp,Rs running_state);
/**
* @brief
* 是否是第一次采样
* is_first_sampling_对mon的作用是保证能取到db2的信息分布和本地的数据特征值
* is_first_sampling_对cron的作用是保证仅第一次存入本地特征和db2信息分布
* is_first_sampling_在mon进程中仅在load数据时dump_size_ != 0才被置为false
* is_first_sampling_在cron进程中仅在falsefirst_sampling_batch成功时才被置为false
* @return true
* @return false
*/
bool is_first_sampling() { return this->is_first_sampling_; }
int load();
int store(const SamplePoint& i_value);
int commit();
/**
* @brief 获得统计数据的大概分布信息
* @return RunningStats
*/
Rs get_running_stats() { return this->rs_; }
// 均值方差 2021-11-29
int put_data_to_rs(const SampleWindow& input_data);
vector<double> get_rs_means();
vector<double> get_rs_variances();
vector<double> get_rs_stddev();
vector<double> get_rs_skewness();
vector<double> get_rs_kurtosis();
vector<double> get_rs_max();
vector<double> get_rs_min();
OutData extract();
/**
* @brief 获取采样数据的大小
* @return size_t
*/
size_t get_sampling_size() { return this->sampling_size_; }
};
} // namespace policy
} // namespace data_handler