# -*- coding: utf-8 -*-
"""
Pandas data file read/write utilities.

This module provides pandas-based reading and writing of data files in
several formats (CSV, Excel, JSON). The main class, PandasDataIO, wraps the
common read and write calls and offers flexible parameter configuration.

Author:
-Author         : zoufuzhou
-Date           : 2025-05-21 16:34:37
-LastEditTime   : 2025-05-21 16:34:37
"""

import os
from typing import Dict, Generator, Iterator, List, Literal, Optional, Union

import pandas as pd


class PandasDataIO:
    """pandas-based reader/writer for data files (CSV / Excel / JSON).

    Instance attributes set by ``__init__``:

    * ``default_read_params``  -- keyword arguments merged into CSV and Excel
      reads (``encoding`` is dropped for Excel).
    * ``default_write_params`` -- keyword arguments merged into CSV writes.
    """

    # Maps a lower-cased file extension to the logical file type used when
    # ``file_type='auto'`` is passed to read_file / write_file.
    _EXTENSION_TYPES = {
        '.csv': 'csv',
        '.xls': 'excel',
        '.xlsx': 'excel',
        '.xlsm': 'excel',
        '.xlsb': 'excel',
        '.json': 'json',
    }

    def __init__(self,
                 index_col: Optional[Union[str, int]] = None,
                 encoding: str = 'utf-8',
                 na_values: Optional[list] = None):
        """
        Initialise the data I/O helper.

        :param index_col: column to use as the row index when reading
        :param encoding: file encoding, defaults to utf-8
        :param na_values: list of strings to recognise as NA/NaN
        """
        self.default_read_params = {
            'index_col': index_col,
            'encoding': encoding,
            'na_values': na_values
        }
        self.default_write_params = {
            'index': False,
            'encoding': encoding
        }

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    @classmethod
    def _resolve_file_type(cls, file_path: str, file_type: str) -> str:
        """Return 'csv', 'excel' or 'json' for the given path / type request.

        :raises ValueError: if the extension is unsupported (auto mode) or
            ``file_type`` is not one of the accepted values
        """
        if file_type == 'auto':
            ext = os.path.splitext(file_path)[1].lower()
            resolved = cls._EXTENSION_TYPES.get(ext)
            if resolved is None:
                raise ValueError(f"不支持的文件类型: {ext}")
            return resolved
        if file_type in ('csv', 'excel', 'json'):
            return file_type
        raise ValueError(f"无效的文件类型参数: {file_type}")

    @staticmethod
    def _to_dataframe(data: Union[pd.DataFrame, dict, list]) -> pd.DataFrame:
        """Return *data* unchanged if it is a DataFrame, else build one from it."""
        if isinstance(data, pd.DataFrame):
            return data
        return pd.DataFrame(data)

    # ------------------------------------------------------------------
    # Readers
    # ------------------------------------------------------------------
    def read_csv(self, file_path: str, **kwargs) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
        """
        Read a CSV file (chunked mode supported).

        The instance's default read parameters are applied first; anything in
        ``kwargs`` overrides them. Passing ``chunksize`` makes pandas return
        an iterator of DataFrames instead of a single DataFrame.

        :param file_path: path of the CSV file
        :param kwargs: extra arguments forwarded to pandas.read_csv
        :return: a single DataFrame, or an iterator of DataFrames when
            ``chunksize`` is given
        :raises FileNotFoundError: if the file does not exist
        :raises Exception: for any other read failure (original error chained)
        """
        params = {**self.default_read_params, **kwargs}
        try:
            return pd.read_csv(file_path, **params)
        except FileNotFoundError as e:
            raise FileNotFoundError(f"文件 {file_path} 不存在") from e
        except Exception as e:
            raise Exception(f"读取CSV文件时出错: {str(e)}") from e

    def read_csv_chunks(self, file_path: str, chunksize: int = 10000, **kwargs) -> Iterator[pd.DataFrame]:
        """
        Read a CSV file in chunks.

        :param file_path: path of the CSV file
        :param chunksize: number of rows per chunk
        :param kwargs: extra arguments forwarded to pandas.read_csv
        :return: iterator yielding one DataFrame per chunk
        :raises FileNotFoundError: if the file does not exist
        :raises Exception: for any other read failure (original error chained)
        """
        params = {**self.default_read_params, 'chunksize': chunksize, **kwargs}
        try:
            return pd.read_csv(file_path, **params)
        except FileNotFoundError as e:
            raise FileNotFoundError(f"文件 {file_path} 不存在") from e
        except Exception as e:
            raise Exception(f"分块读取CSV文件时出错: {str(e)}") from e

    def read_excel_chunks(self, file_path: str, chunksize: int = 10000, **kwargs) -> Generator[pd.DataFrame, None, None]:
        """
        Read an Excel file in chunks (simulated).

        The whole sheet is loaded into memory through ``read_excel`` and then
        sliced, so this bounds the size of each yielded frame, not peak
        memory. Because this is a generator, errors surface on first
        iteration rather than at call time.

        :param file_path: path of the Excel file
        :param chunksize: number of rows per chunk, must be positive
        :param kwargs: extra arguments forwarded to ``read_excel``
        :return: generator yielding one DataFrame per chunk
        :raises ValueError: if ``chunksize`` is not positive
        :raises FileNotFoundError: if the file does not exist
        :raises TypeError: if the read produced several sheets
        :raises Exception: for any other read failure (original error chained)
        """
        # A negative step would make range() empty and silently yield nothing.
        if chunksize <= 0:
            raise ValueError(f"chunksize 必须为正整数: {chunksize}")

        try:
            df = self.read_excel(file_path, **kwargs)
        except FileNotFoundError:
            # Keep the specific type, matching the other readers.
            raise
        except Exception as e:
            raise Exception(f"分块读取Excel文件时出错: {str(e)}") from e

        # sheet_name=None or a list makes pandas return a dict of frames,
        # which cannot be sliced row-wise.
        if not isinstance(df, pd.DataFrame):
            raise TypeError("分块读取Excel文件时出错: 仅支持读取单个工作表")

        for start in range(0, len(df), chunksize):
            yield df.iloc[start:start + chunksize]

    def validate_data(self, df: pd.DataFrame,
                      rules: Optional[Dict[str, List[str]]] = None,
                      check_na: bool = True) -> Dict[str, List[str]]:
        """
        Validate a DataFrame.

        A type rule passes when any of its strings is a substring of the
        column's dtype name (e.g. ``"int"`` matches ``int64``). Columns named
        in ``rules`` but absent from ``df`` are ignored.

        :param df: DataFrame to validate
        :param rules: validation rules ``{column name: [dtype name fragments]}``;
            a bare string is treated as a one-element list
        :param check_na: whether to check for missing values
        :return: error dictionary ``{column name: [messages]}``; missing-value
            findings are stored under the ``'_na'`` key
        """
        errors: Dict[str, List[str]] = {}

        # Missing-value check
        if check_na:
            na_cols = df.columns[df.isna().any()].tolist()
            if na_cols:
                # Column labels may be ints or tuples, so stringify before joining.
                joined = ', '.join(str(col) for col in na_cols)
                errors['_na'] = [f"缺失值存在于列: {joined}"]

        # Data-type check
        for col, types in (rules or {}).items():
            if col not in df.columns:
                continue
            if isinstance(types, str):
                # Iterating a bare string would test single characters.
                types = [types]
            col_type = str(df[col].dtype)
            if not any(t in col_type for t in types):
                errors.setdefault(col, []).append(
                    f"数据类型不符: 期望{types}, 实际{col_type}"
                )

        return errors

    def read_excel(self, file_path: str, sheet_name: Union[str, int, list, None] = 0, **kwargs) -> pd.DataFrame:
        """
        Read an Excel file.

        :param file_path: path of the Excel file
        :param sheet_name: sheet name or index, defaults to 0 (first sheet)
        :param kwargs: extra arguments forwarded to pandas.read_excel
        :return: DataFrame holding the Excel data
        :raises FileNotFoundError: if the file does not exist
        :raises Exception: for any other read failure (original error chained)
        """
        # Copy the default parameters but leave out 'encoding'.
        params = {k: v for k, v in self.default_read_params.items() if k != 'encoding'}
        params.update(kwargs)
        try:
            return pd.read_excel(file_path, sheet_name=sheet_name, **params)
        except FileNotFoundError as e:
            raise FileNotFoundError(f"文件 {file_path} 不存在") from e
        except Exception as e:
            raise Exception(f"读取Excel文件时出错: {str(e)}") from e

    def read_json(self, file_path: str, orient: str = 'records', **kwargs) -> pd.DataFrame:
        """
        Read a JSON file.

        :param file_path: path of the JSON file
        :param orient: JSON orientation ('records', 'columns', 'index', ...)
        :param kwargs: extra arguments forwarded to pandas.read_json
        :return: DataFrame holding the JSON data
        :raises FileNotFoundError: if pandas reports the file as missing
        :raises Exception: for any other read failure (original error chained)
        """
        params = {'orient': orient, **kwargs}
        try:
            return pd.read_json(file_path, **params)
        except FileNotFoundError as e:
            raise FileNotFoundError(f"文件 {file_path} 不存在") from e
        except Exception as e:
            raise Exception(f"读取JSON文件时出错: {str(e)}") from e

    def read_file(self, file_path: str,
                  file_type: Literal['auto', 'csv', 'excel', 'json'] = 'auto',
                  **kwargs) -> pd.DataFrame:
        """
        Generic read entry point that can detect the file type.

        :param file_path: path of the file
        :param file_type: file type ('auto' to detect from the extension,
            or 'csv', 'excel', 'json')
        :param kwargs: extra arguments forwarded to the matching read method
        :return: DataFrame holding the file data
        :raises ValueError: for an unsupported extension or invalid ``file_type``
        """
        readers = {
            'csv': self.read_csv,
            'excel': self.read_excel,
            'json': self.read_json,
        }
        resolved = self._resolve_file_type(file_path, file_type)
        return readers[resolved](file_path, **kwargs)

    # ------------------------------------------------------------------
    # Writers
    # ------------------------------------------------------------------
    def write_json(self, file_path: str, data: Union[pd.DataFrame, dict, list],
                   orient: str = 'records', **kwargs) -> None:
        """
        Write a JSON file.

        :param file_path: path of the JSON file
        :param data: data to write; a DataFrame, dict or list
        :param orient: JSON orientation ('records', 'columns', 'index', ...)
        :param kwargs: extra arguments forwarded to DataFrame.to_json
        :raises Exception: on any conversion or write failure (original error chained)
        """
        params = {'orient': orient, **kwargs}
        try:
            self._to_dataframe(data).to_json(file_path, **params)
        except Exception as e:
            raise Exception(f"写入JSON文件时出错: {str(e)}") from e

    def write_excel(self, file_path: str, data: Union[pd.DataFrame, dict, list],
                    sheet_name: str = 'Sheet1', **kwargs) -> None:
        """
        Write an Excel file.

        The instance's default write parameters are not applied here, so
        unless ``index`` is passed in ``kwargs`` the pandas default applies
        and the row index is written.

        :param file_path: path of the Excel file
        :param data: data to write; a DataFrame, dict or list
        :param sheet_name: sheet name, defaults to 'Sheet1'
        :param kwargs: extra arguments forwarded to DataFrame.to_excel
        :raises Exception: on any conversion or write failure (original error chained)
        """
        params = {'sheet_name': sheet_name, **kwargs}
        try:
            self._to_dataframe(data).to_excel(file_path, **params)
        except Exception as e:
            raise Exception(f"写入Excel文件时出错: {str(e)}") from e

    def write_file(self, file_path: str, data: Union[pd.DataFrame, dict, list],
                   file_type: Literal['auto', 'csv', 'excel', 'json'] = 'auto',
                   **kwargs) -> None:
        """
        Generic write entry point that can detect the file type.

        :param file_path: path of the file
        :param data: data to write
        :param file_type: file type ('auto' to detect from the extension,
            or 'csv', 'excel', 'json')
        :param kwargs: extra arguments forwarded to the matching write method
        :raises ValueError: for an unsupported extension or invalid ``file_type``
        """
        writers = {
            'csv': self.write_csv,
            'excel': self.write_excel,
            'json': self.write_json,
        }
        resolved = self._resolve_file_type(file_path, file_type)
        writers[resolved](file_path, data, **kwargs)

    def write_csv(self, file_path: str, data: Union[pd.DataFrame, dict, list], **kwargs) -> None:
        """
        Write a CSV file.

        The instance's default write parameters (no index column, configured
        encoding) are applied first; anything in ``kwargs`` overrides them.

        :param file_path: path of the CSV file
        :param data: data to write; a DataFrame, dict or list
        :param kwargs: extra arguments forwarded to DataFrame.to_csv
        :raises Exception: on any conversion or write failure (original error chained)
        """
        params = {**self.default_write_params, **kwargs}
        try:
            self._to_dataframe(data).to_csv(file_path, **params)
        except Exception as e:
            raise Exception(f"写入CSV文件时出错: {str(e)}") from e


# Usage example
#from file.PathUtil import PathUtil
#if __name__ == "__main__":
#    io = PandasDataIO()
#
#    # Sample data
#    sample_data = [
#        {"name": "Alice", "age": 25, "city": "New York"},
#        {"name": "Bob", "age": 30, "city": "London"}
#    ]
#
#    path = PathUtil().getEnv("HOME") + "/data/"
#    # CSV operations
#    print("\n=== CSV操作 ===")
#    io.write_file(path + "example.csv", sample_data)  # type detected automatically
#    print("CSV读取结果:\n", io.read_file(path + "example.csv").head())
#
#    # Excel operations
#    print("\n=== Excel操作 ===")
#    try:
#        # Write, then read back
#        io.write_file(path + "example.xlsx", sample_data)  # type detected automatically
#        excel_data = io.read_file(path + "example.xlsx")
#        print("Excel读取结果:\n", excel_data.head())
#
#        # Write and read with an explicit sheet name
#        io.write_excel(path + "example_sheet.xlsx", sample_data, sheet_name="Employees")
#        sheet_data = io.read_excel(path + "example_sheet.xlsx", sheet_name="Employees")
#        print("\n自定义sheet读取结果:\n", sheet_data.head())
#
#        # Pass-through of extra parameters
#        print("\n带参数读取测试:")
#        param_data = io.read_excel(path + "example.xlsx", header=0, skiprows=0)
#        print(param_data.head())
#
#    except ImportError:
#        print("Excel功能需要安装openpyxl: pip install openpyxl")
#    except Exception as e:
#        print(f"Excel操作出错: {str(e)}")
#
#    # JSON operations
#    print("\n=== JSON操作 ===")
#    io.write_file(path + "example.json", sample_data)  # type detected automatically
#    print("JSON读取结果(records格式):\n", io.read_file(path + "example.json").head())
#
#    # Custom JSON orientation
#    io.write_json(path + "example_columns.json", sample_data, orient="columns")
#    print("JSON读取结果(columns格式):\n", pd.read_json(path + "example_columns.json"))
#
#    # Chunked reading demo
#    print("\n=== 分块读取演示 ===")
#    try:
#        # Build a large CSV file to test with
#        large_data = pd.concat([pd.DataFrame(sample_data)] * 1000)
#        io.write_csv(path + "large_data.csv", large_data)
#
#        print("分块读取CSV文件:")
#        chunk_count = 0
#        for chunk in io.read_csv_chunks(path + "large_data.csv", chunksize=500):
#            chunk_count += 1
#            print(f"处理第{chunk_count}块, 行数: {len(chunk)}")
#        print(f"共处理{chunk_count}块数据")
#    except Exception as e:
#        print(f"分块读取失败: {str(e)}")
#
#    # Data validation demo
#    print("\n=== 数据验证演示 ===")
#    test_data = [
#        {"name": "Alice", "age": 25, "city": "New York"},
#        {"name": "Bob", "age": "thirty", "city": None},  # deliberately invalid
#        {"name": "Charlie", "age": 35, "city": "London"}
#    ]
#    test_df = pd.DataFrame(test_data)
#
#    # Define the validation rules
#    rules = {
#        "age": ["int"],
#        "city": ["object"]
#    }
#
#    errors = io.validate_data(test_df, rules=rules, check_na=True)
#    if errors:
#        print("数据验证发现错误:")
#        for col, msgs in errors.items():
#            print(f"{col}:")
#            for msg in msgs:
#                print(f"  - {msg}")
#    else:
#        print("数据验证通过")