Test driving Iceberg from Impala
You complete a task that creates Iceberg tables from Impala with mock data that you can test drive using your own queries. You learn how to work with partitioned tables.
- You must meet the prerequisites to query Iceberg tables mentioned earlier, including obtaining Ranger access permissions.
 
- In Impala, use a database.
 - 
        Create an Impala table to hold mock data for this task.
        
create external table mock_rows stored as parquet as select x from ( with v as (values (1 as x), (1), (1), (1), (1)) select v.x from v, v v2, v v3, v v4, v v5, v v6 ) a; - 
        Create another Impala table based on mock_rows. 
        
create external table customer_demo stored as parquet as select FROM_TIMESTAMP(DAYS_SUB(now() , cast ( TRUNC(RAND(7)*365*1) as bigint)), 'yyyy-MM') as year_month, DAYS_SUB(now() , cast ( TRUNC(RAND(7)*365*1) as bigint)) as ts, CONCAT( cast ( TRUNC(RAND(1) * 250 + 2) as string), '.' , cast ( TRUNC(RAND(2) * 250 + 2) as string), '.', cast ( TRUNC(RAND(3) * 250 + 2) as string), '.', cast ( TRUNC(RAND(4) * 250 + 2) as string) ) as ip, CONCAT("USER_", cast ( TRUNC(RAND(4) * 1000) as string),'@somedomain.com') as email, CONCAT("USER_", cast ( TRUNC(RAND(5) * 1000) as string)) as username, CONCAT("USER_", cast ( TRUNC(RAND(6) * 100) as string)) as country, cast( RAND(8)*10000 as double) as metric_1, cast( RAND(9)*10000 as double) as metric_2, cast( RAND(10)*10000 as double) as metric_3, cast( RAND(11)*10000 as double) as metric_4, cast( RAND(12)*10000 as double) as metric_5 from mock_rows ; - 
        Create another Impala table based on mock_rows.
        
create external table customer_demo2 stored as parquet as select FROM_TIMESTAMP(DAYS_SUB(now() , cast ( TRUNC(RAND(7)*365*1) as bigint)), 'yyyy-MM') as year_month, DAYS_SUB(now() , cast ( TRUNC(RAND(7)*365*1) as bigint)) as ts, CONCAT( cast ( TRUNC(RAND(1) * 250 + 2) as string), '.' , cast ( TRUNC(RAND(2) * 250 + 2) as string), '.', cast ( TRUNC(RAND(3) * 250 + 2) as string), '.', cast ( TRUNC(RAND(4) * 250 + 2) as string) ) as ip, CONCAT("USER_", cast ( TRUNC(RAND(4) * 1000) as string),'@somedomain.com') as email, CONCAT("USER_", cast ( TRUNC(RAND(5) * 1000) as string)) as username, CONCAT("USER_", cast ( TRUNC(RAND(6) * 100) as string)) as country, cast( RAND(8)*10000 as double) as metric_1, cast( RAND(9)*10000 as double) as metric_2, cast( RAND(10)*10000 as double) as metric_3, cast( RAND(11)*10000 as double) as metric_4, cast( RAND(12)*10000 as double) as metric_5 from mock_rows ; - 
    Create an Iceberg table from the customer_demo table.
CREATE TABLE customer_demo_iceberg STORED BY ICEBERG AS SELECT * FROM customer_demo; - 
    Insert into the customer_demo_iceberg table the results of selecting all data from the customer_demo2 table.
    
INSERT INTO customer_demo_iceberg select * from customer_demo2; INSERT INTO customer_demo_iceberg select * from customer_demo2; INSERT INTO customer_demo_iceberg select * from customer_demo2; - 
  Create an Iceberg table partitioned by the year_month column and based on the customer_demo_iceberg table.
CREATE TABLE customer_demo_iceberg_part PARTITIONED BY(year_month) STORED BY ICEBERG AS SELECT ts, ip , email, username , country, metric_1 , metric_2 , metric_3 , metric_4 , metric_5, year_month FROM customer_demo_iceberg; - 
  Split the partitioned data into manageable files.
ALTER TABLE customer_demo_iceberg_part SET PARTITION SPEC (year_month,BUCKET(15, country)); - 
    Insert the results of reading the customer_demo_iceberg table into the partitioned table.
   
INSERT INTO customer_demo_iceberg_part (year_month, ts, ip, email, username, country, metric_1, metric_2, metric_3, metric_4, metric_5) SELECT year_month, ts, ip, email, username, country, metric_1, metric_2, metric_3, metric_4, metric_5 FROM customer_demo_iceberg; - 
        Run time travel queries on the Iceberg tables, using the history output to get the
          snapshot id, and substitute the id in the second SELECT query.
        
SELECT * FROM customer_demo_iceberg FOR SYSTEM_TIME AS OF '2021-12-09 05:39:18.689000000' LIMIT 100; DESCRIBE HISTORY customer_demo_iceberg; SELECT * FROM customer_demo_iceberg FOR SYSTEM_VERSION AS OF <snapshot id> LIMIT 100; 
