[离线计算-Spark|Hive] HDFS小文件处理 - 开发者社区

[离线计算-Spark|Hive] HDFS小文件处理

 // Excerpt: distributes the pending insert records of each partition into buckets.
 // Inserts are first packed into existing small files (as update buckets); whatever is
 // left over is spread across newly created INSERT buckets.
 // NOTE(review): this excerpt appears abridged — several closing braces are missing and
 // pStat, bucketNumbers, recordsPerBucket and totalBuckets are not declared in view.
 // Get the partition paths from the workload profile.
 Set<String> partitionPaths = profile.getPartitionPaths();
 // Derive the average record size from records written by previously completed commits;
 // it is used to estimate how many records can be packed into one file.
 long averageRecordSize = averageBytesPerRecord(table.getMetaClient().getActiveTimeline().getCommitTimeline().filterCompletedInstants(),config);
 LOG.info("AvgRecordSize => " + averageRecordSize);
 // Look up the small files under each partition path.
 Map<String, List<SmallFile>> partitionSmallFilesMap =
        getSmallFilesForPartitions(new ArrayList<String>(partitionPaths), jsc);
for (String partitionPath : partitionPaths) {
     List<SmallFile> smallFiles = partitionSmallFilesMap.get(partitionPath);
    // Number of insert records not yet assigned to any bucket.
    long totalUnassignedInserts = pStat.getNumInserts();  
    for (SmallFile smallFile : smallFiles) {
      // hoodie.parquet.max.file.size: maximum data file size; Hudi tries to keep files at this size.
      // Remaining capacity = max file size - current small file size; dividing it by the
      // average record size gives the number of records that can still be appended.
      // NOTE(review): long division — throws if averageRecordSize is 0; confirm the helper never returns 0.
      long recordsToAppend = Math.min((config.getParquetMaxFileSize() - smallFile.sizeBytes) / averageRecordSize, totalUnassignedInserts);
        // Assign records to this small file.
        if (recordsToAppend > 0 && totalUnassignedInserts > 0) {
            // create a new bucket or re-use an existing bucket
            int bucket;
            if (updateLocationToBucket.containsKey(smallFile.location.getFileId())) {
              bucket = updateLocationToBucket.get(smallFile.location.getFileId());
              LOG.info("Assigning " + recordsToAppend + " inserts to existing update bucket " + bucket);
            } else {
              bucket = addUpdateBucket(partitionPath, smallFile.location.getFileId());
              LOG.info("Assigning " + recordsToAppend + " inserts to new update bucket " + bucket);
            bucketNumbers.add(bucket);
            recordsPerBucket.add(recordsToAppend);
            // Subtract the records that have just been assigned.
            totalUnassignedInserts -= recordsToAppend;
        // If some records are still unassigned, create new insert buckets for them.
        if (totalUnassignedInserts > 0) {
            // hoodie.copyonwrite.insert.split.size: number of records per insert bucket.
            long insertRecordsPerBucket = config.getCopyOnWriteInsertSplitSize();
            // When auto-tuning is enabled, derive the per-bucket record count from the
            // max file size and the average record size instead.
            if (config.shouldAutoTuneInsertSplits()) {
                insertRecordsPerBucket = config.getParquetMaxFileSize() / averageRecordSize;
           // Number of buckets to create (rounded up so the remainder gets its own bucket).
           int insertBuckets = (int) Math.ceil((1.0 * totalUnassignedInserts) / insertRecordsPerBucket); 
          for (int b = 0; b < insertBuckets; b++) {
            bucketNumbers.add(totalBuckets);
            if (b == insertBuckets - 1) {
              // The last bucket takes whatever records remain after the full buckets.
              recordsPerBucket.add(totalUnassignedInserts - (insertBuckets - 1) * insertRecordsPerBucket);
            } else {
              recordsPerBucket.add(insertRecordsPerBucket);
            // Register the new bucket as an INSERT bucket for this partition with a fresh file-id prefix.
            BucketInfo bucketInfo = new BucketInfo();
            bucketInfo.bucketType = BucketType.INSERT;
            bucketInfo.partitionPath = partitionPath;
            bucketInfo.fileIdPrefix = FSUtils.createNewFileIdPfx();
            bucketInfoMap.put(totalBuckets, bucketInfo);
            totalBuckets++;
}

 // Excerpt: collects the small base files of a partition as of the latest commit.
 // NOTE(review): the excerpt is truncated after sf.sizeBytes is set; commitTimeline and
 // partitionPath are declared outside this view.
 if (!commitTimeline.empty()) { // if we have some commits
      // Use the last instant on the timeline as the cut-off for the file listing.
      HoodieInstant latestCommitTime = commitTimeline.lastInstant().get();
      List<HoodieBaseFile> allFiles = table.getBaseFileOnlyView()
          .getLatestBaseFilesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp()).collect(Collectors.toList());
      for (HoodieBaseFile file : allFiles) {
        // A file smaller than hoodie.parquet.small.file.limit is treated as a small file.
        if (file.getFileSize() < config.getParquetSmallFileLimit()) {
          String filename = file.getFileName();
          // Record the file's location (commit time + file id parsed from its name) and its size.
          SmallFile sf = new SmallFile();
          sf.location = new HoodieRecordLocation(FSUtils.getCommitTime(filename), FSUtils.getFileId(filename));
          sf.sizeBytes = file.getFileSize();

[离线计算-Spark|Hive] HDFS小文件处理

[离线计算-Spark|Hive] HDFS小文件处理

背景

小文件解决思路

Hudi小文件处理