启动 spark-sql
因为 iceberg 相关的 jars 已经在 ${SPARK_HOME}/jars 目录下,所以不需要使用 --jars
或者 --packages 参数。
spark-sql --master local[1] \
  --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions \
  --conf spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog \
  --conf spark.sql.catalog.spark_catalog.type=hive
创建普通表
create table t1(c1 string) stored as textfile;
load data local inpath '/etc/profile' into table t1;
创建 iceberg 表
create table ti(c1 string) using iceberg;
show create table ti;
CREATE TABLE spark_catalog.test.ti (c1 STRING)
USING iceberg
LOCATION 'hdfs://bmr-cluster/apps/spark/warehouse/test.db/ti'
TBLPROPERTIES ('current-snapshot-id' = 'none','format' = 'iceberg/parquet','format-version' = '2','write.parquet.compression-codec' = 'zstd');
这时表目录下仅有一个 metadata 目录,metadata 目录下有一个 metadata.json 文件。
[hive@master-aa9bafd-2 ~]$ hadoop fs -ls hdfs://bmr-cluster/apps/spark/warehouse/test.db/ti;
Found 1 items
drwxr-xr-x - hive hadoop 0 2024-09-18 16:44 hdfs://bmr-cluster/apps/spark/warehouse/test.db/ti/metadata
[hive@master-aa9bafd-2 ~]$ hadoop fs -ls hdfs://bmr-cluster/apps/spark/warehouse/test.db/ti/metadata
Found 1 items
-rw-r--r-- 3 hive hadoop 907 2024-09-18 16:44 hdfs://bmr-cluster/apps/spark/warehouse/test.db/ti/metadata/00000-831f9491-0ebf-45e6-9ead-902bc62ba658.metadata.json
- metadata.json 文件内容:
{"format-version" : 2,"table-uuid" : "851c7d16-3dde-407b-848b-f4c07522532f","location" : "hdfs://bmr-cluster/apps/spark/warehouse/test.db/ti","last-sequence-number" : 0,"last-updated-ms" : 1726649083494,"last-column-id" : 1,"current-schema-id" : 0,"schemas" : [ {"type" : "struct","schema-id" : 0,"fields" : [ {"id" : 1,"name" : "c1","required" : false,"type" : "string"} ]} ],"default-spec-id" : 0,"partition-specs" : [ {"spec-id" : 0,"fields" : [ ]} ],"last-partition-id" : 999,"default-sort-order-id" : 0,"sort-orders" : [ {"order-id" : 0,"fields" : [ ]} ],"properties" : {"owner" : "hive","write.parquet.compression-codec" : "zstd"},"current-snapshot-id" : -1,"refs" : { },"snapshots" : [ ],"statistics" : [ ],"snapshot-log" : [ ],"metadata-log" : [ ]
}
insert
insert into ti select * from t1;
插入记录后,表目录下多了一个 data 目录。
[hive@master-aa9bafd-2 ~]$ hadoop fs -ls hdfs://bmr-cluster/apps/spark/warehouse/test.db/ti
Found 2 items
drwxr-xr-x - hive hadoop 0 2024-09-18 16:50 hdfs://bmr-cluster/apps/spark/warehouse/test.db/ti/data
drwxr-xr-x - hive hadoop 0 2024-09-18 16:50 hdfs://bmr-cluster/apps/spark/warehouse/test.db/ti/metadata
再次执行 show create table,可以看到 current-snapshot-id 发生了变化。
spark-sql (test)> show create table ti;
CREATE TABLE spark_catalog.test.ti (c1 STRING)
USING iceberg
LOCATION 'hdfs://bmr-cluster/apps/spark/warehouse/test.db/ti'
TBLPROPERTIES ('current-snapshot-id' = '5859224922072073702','format' = 'iceberg/parquet','format-version' = '2','write.parquet.compression-codec' = 'zstd')
Time taken: 0.034 seconds, Fetched 1 row(s)
metadata
metadata 下有4个文件,去掉创建时生成的 00000-831f9491-0ebf-45e6-9ead-902bc62ba658.metadata.json,现在解释以下 3 个文件。
[hive@master-aa9bafd-2 ~]$ hadoop fs -ls hdfs://bmr-cluster/apps/spark/warehouse/test.db/ti/metadata
Found 4 items
-rw-r--r-- 3 hive hadoop 907 2024-09-18 16:44 hdfs://bmr-cluster/apps/spark/warehouse/test.db/ti/metadata/00000-831f9491-0ebf-45e6-9ead-902bc62ba658.metadata.json
-rw-r--r-- 3 hive hadoop 2006 2024-09-18 16:50 hdfs://bmr-cluster/apps/spark/warehouse/test.db/ti/metadata/00001-c38f8b27-0e16-41f1-b8d2-410ba46fa276.metadata.json
-rw-r--r-- 3 hive hadoop 6618 2024-09-18 16:50 hdfs://bmr-cluster/apps/spark/warehouse/test.db/ti/metadata/c7bf675a-ef11-4dd3-a9a2-4dd9cd7c300c-m0.avro
-rw-r--r-- 3 hive hadoop 4269 2024-09-18 16:50 hdfs://bmr-cluster/apps/spark/warehouse/test.db/ti/metadata/snap-5859224922072073702-1-c7bf675a-ef11-4dd3-a9a2-4dd9cd7c300c.avro
- 第1个文件 00001-c38f8b27-0e16-41f1-b8d2-410ba46fa276.metadata.json
当前的 metadata 文件,包含以下内容:
{"format-version" : 2,"table-uuid" : "851c7d16-3dde-407b-848b-f4c07522532f","location" : "hdfs://bmr-cluster/apps/spark/warehouse/test.db/ti","last-sequence-number" : 1,"last-updated-ms" : 1726649449201,"last-column-id" : 1,"current-schema-id" : 0,"schemas" : [ {"type" : "struct","schema-id" : 0,"fields" : [ {"id" : 1,"name" : "c1","required" : false,"type" : "string"} ]} ],"default-spec-id" : 0,"partition-specs" : [ {"spec-id" : 0,"fields" : [ ]} ],"last-partition-id" : 999,"default-sort-order-id" : 0,"sort-orders" : [ {"order-id" : 0,"fields" : [ ]} ],"properties" : {"owner" : "hive","write.parquet.compression-codec" : "zstd"},"current-snapshot-id" : 5859224922072073702,"refs" : {"main" : {"snapshot-id" : 5859224922072073702,"type" : "branch"}},"snapshots" : [ {"sequence-number" : 1,"snapshot-id" : 5859224922072073702,"timestamp-ms" : 1726649449201,"summary" : {"operation" : "append","spark.app.id" : "local-1726648289519","added-data-files" : "1","added-records" : "88","added-files-size" : "1735","changed-partition-count" : "1","total-records" : "88","total-files-size" : "1735","total-data-files" : "1","total-delete-files" : "0","total-position-deletes" : "0","total-equality-deletes" : "0"},"manifest-list" : "hdfs://bmr-cluster/apps/spark/warehouse/test.db/ti/metadata/snap-5859224922072073702-1-c7bf675a-ef11-4dd3-a9a2-4dd9cd7c300c.avro","schema-id" : 0} ],"statistics" : [ ],"snapshot-log" : [ {"timestamp-ms" : 1726649449201,"snapshot-id" : 5859224922072073702} ],"metadata-log" : [ {"timestamp-ms" : 1726649083494,"metadata-file" : "hdfs://bmr-cluster/apps/spark/warehouse/test.db/ti/metadata/00000-831f9491-0ebf-45e6-9ead-902bc62ba658.metadata.json"} ]
}
snapshots 字段记录了表的快照信息,current-snapshot-id 指向当前快照。
- 第2个文件 snap-5859224922072073702-1-c7bf675a-ef11-4dd3-a9a2-4dd9cd7c300c.avro 是 manifest list 文件。
包含 manifest 文件 c7bf675a-ef11-4dd3-a9a2-4dd9cd7c300c-m0.avro。
hadoop fs -text hdfs://bmr-cluster/apps/spark/warehouse/test.db/ti/metadata/snap-5859224922072073702-1-c7bf675a-ef11-4dd3-a9a2-4dd9cd7c300c.avro
{"manifest_path":"hdfs://bmr-cluster/apps/spark/warehouse/test.db/ti/metadata/c7bf675a-ef11-4dd3-a9a2-4dd9cd7c300c-m0.avro","manifest_length":6618,"partition_spec_id":0,"content":0,"sequence_number":1,"min_sequence_number":1,"added_snapshot_id":5859224922072073702,"added_data_files_count":1,"existing_data_files_count":0,"deleted_data_files_count":0,"added_rows_count":88,"existing_rows_count":0,"deleted_rows_count":0,"partitions":{"array":[]}}
- 第3个文件 c7bf675a-ef11-4dd3-a9a2-4dd9cd7c300c-m0.avro 是 manifest 文件。
[hive@master-aa9bafd-2 ~]$ hadoop fs -text hdfs://bmr-cluster/apps/spark/warehouse/test.db/ti/metadata/c7bf675a-ef11-4dd3-a9a2-4dd9cd7c300c-m0.avro
输出结果中的 data_file 字段描述了数据文件的信息:
{"status":1,"snapshot_id":{"long":5859224922072073702},"sequence_number":null,"file_sequence_number":null,"data_file":{"content":0,"file_path":"hdfs://bmr-cluster/apps/spark/warehouse/test.db/ti/data/00000-3-9038b786-1a74-4a42-ac4e-45a3db21e4b5-00001.parquet","file_format":"PARQUET","partition":{},"record_count":88,"file_size_in_bytes":1735,"column_sizes":{"array":[{"key":1,"value":1375}]},"value_counts":{"array":[{"key":1,"value":88}]},"null_value_counts":{"array":[{"key":1,"value":0}]},"nan_value_counts":{"array":[]},"lower_bounds":{"array":[{"key":1,"value":""}]},"upper_bounds":{"array":[{"key":1,"value":"}"}]},"key_metadata":null,"split_offsets":{"array":[4]},"equality_ids":null,"sort_order_id":{"int":0}}}
每次 insert,metadata 目录增加 3 个文件
再次执行
insert into ti select * from t1;
可以看到 metadata 目录下增加了 3 个文件。