public class SparkTableUtil
extends java.lang.Object
Modifier and Type | Class and Description |
---|---|
static class |
SparkTableUtil.SparkPartition
Class representing a table partition.
|
Modifier and Type | Method and Description |
---|---|
static java.util.List<SparkTableUtil.SparkPartition> |
getPartitions(org.apache.spark.sql.SparkSession spark,
java.lang.String table)
Returns all partitions in the table.
|
static java.util.List<SparkTableUtil.SparkPartition> |
getPartitions(org.apache.spark.sql.SparkSession spark,
org.apache.spark.sql.catalyst.TableIdentifier tableIdent)
Returns all partitions in the table.
|
static java.util.List<SparkTableUtil.SparkPartition> |
getPartitionsByFilter(org.apache.spark.sql.SparkSession spark,
java.lang.String table,
java.lang.String predicate)
Returns partitions that match the specified 'predicate'.
|
static java.util.List<SparkTableUtil.SparkPartition> |
getPartitionsByFilter(org.apache.spark.sql.SparkSession spark,
org.apache.spark.sql.catalyst.TableIdentifier tableIdent,
org.apache.spark.sql.catalyst.expressions.Expression predicateExpr)
Returns partitions that match the specified 'predicate'.
|
static void |
importSparkPartitions(org.apache.spark.sql.SparkSession spark,
java.util.List<SparkTableUtil.SparkPartition> partitions,
Table targetTable,
PartitionSpec spec,
java.lang.String stagingDir)
Import files from given partitions to an Iceberg table.
|
static void |
importSparkTable(org.apache.spark.sql.SparkSession spark,
org.apache.spark.sql.catalyst.TableIdentifier sourceTableIdent,
Table targetTable,
java.lang.String stagingDir)
Import files from an existing Spark table to an Iceberg table.
|
static java.util.List<DataFile> |
listPartition(java.util.Map<java.lang.String,java.lang.String> partition,
java.lang.String uri,
java.lang.String format,
PartitionSpec spec,
org.apache.hadoop.conf.Configuration conf,
MetricsConfig metricsConfig)
Returns the data files in a partition by listing the partition location.
|
static java.util.List<DataFile> |
listPartition(SparkTableUtil.SparkPartition partition,
PartitionSpec spec,
SerializableConfiguration conf,
MetricsConfig metricsConfig)
Returns the data files in a partition by listing the partition location.
|
static org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> |
partitionDF(org.apache.spark.sql.SparkSession spark,
java.lang.String table)
Returns a DataFrame with a row for each partition in the table.
|
static org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> |
partitionDFByFilter(org.apache.spark.sql.SparkSession spark,
java.lang.String table,
java.lang.String expression)
Returns a DataFrame with a row for each partition that matches the specified 'expression'.
|
public static org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> partitionDF(org.apache.spark.sql.SparkSession spark, java.lang.String table)
spark - a Spark session
table - a table name and (optional) database

public static org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> partitionDFByFilter(org.apache.spark.sql.SparkSession spark, java.lang.String table, java.lang.String expression)
spark - a Spark session
table - name of the table
expression - the expression whose matching partitions are returned

public static java.util.List<SparkTableUtil.SparkPartition> getPartitions(org.apache.spark.sql.SparkSession spark, java.lang.String table)
spark - a Spark session
table - a table name and (optional) database

public static java.util.List<SparkTableUtil.SparkPartition> getPartitions(org.apache.spark.sql.SparkSession spark, org.apache.spark.sql.catalyst.TableIdentifier tableIdent)
spark - a Spark session
tableIdent - a table identifier

public static java.util.List<SparkTableUtil.SparkPartition> getPartitionsByFilter(org.apache.spark.sql.SparkSession spark, java.lang.String table, java.lang.String predicate)
spark - a Spark session
table - a table name and (optional) database
predicate - a predicate on partition columns

public static java.util.List<SparkTableUtil.SparkPartition> getPartitionsByFilter(org.apache.spark.sql.SparkSession spark, org.apache.spark.sql.catalyst.TableIdentifier tableIdent, org.apache.spark.sql.catalyst.expressions.Expression predicateExpr)
spark - a Spark session
tableIdent - a table identifier
predicateExpr - a predicate expression on partition columns

public static java.util.List<DataFile> listPartition(SparkTableUtil.SparkPartition partition, PartitionSpec spec, SerializableConfiguration conf, MetricsConfig metricsConfig)
partition - a partition
conf - a serializable Hadoop conf
metricsConfig - a metrics conf

public static java.util.List<DataFile> listPartition(java.util.Map<java.lang.String,java.lang.String> partition, java.lang.String uri, java.lang.String format, PartitionSpec spec, org.apache.hadoop.conf.Configuration conf, MetricsConfig metricsConfig)
partition - partition key, e.g., "a=1/b=2"
uri - partition location URI
format - partition format, avro or parquet
conf - a Hadoop conf
metricsConfig - a metrics conf

public static void importSparkTable(org.apache.spark.sql.SparkSession spark, org.apache.spark.sql.catalyst.TableIdentifier sourceTableIdent, Table targetTable, java.lang.String stagingDir)
spark - a Spark session
sourceTableIdent - an identifier of the source Spark table
targetTable - an Iceberg table where to import the data
stagingDir - a staging directory to store temporary manifest files

public static void importSparkPartitions(org.apache.spark.sql.SparkSession spark, java.util.List<SparkTableUtil.SparkPartition> partitions, Table targetTable, PartitionSpec spec, java.lang.String stagingDir)
spark - a Spark session
partitions - partitions to import
targetTable - an Iceberg table where to import the data
spec - a partition spec
stagingDir - a staging directory to store temporary manifest files