From fa94bdbdce3052b44e69a50d0c7b5f053ce972be Mon Sep 17 00:00:00 2001 From: baklarz Date: Mon, 11 Mar 2024 17:10:03 -0400 Subject: [PATCH] Deployed 1bcea72 with MkDocs version: 1.3.1 --- search/search_index.json | 2 +- sitemap.xml | 90 ++++++++++++++++++------------------- sitemap.xml.gz | Bin 223 -> 223 bytes wxd-certificate/index.html | 3 +- 4 files changed, 48 insertions(+), 47 deletions(-) diff --git a/search/search_index.json b/search/search_index.json index 552de2e..1135200 100644 --- a/search/search_index.json +++ b/search/search_index.json @@ -1 +1 @@ -{"config":{"indexing":"full","lang":["en"],"min_search_length":3,"prebuild_index":false,"separator":"[\\s\\-]+"},"docs":[{"location":"","text":"Introducing watsonx.data The next-gen watsonx.data lakehouse is designed to overcome the costs and complexities enterprises face. This will be the world\u2019s first and only open data store with multi-engine support that is built for hybrid deployment across your entire ecosystem. Watsonx.data is the only lakehouse with multiple query engines, allowing you to optimize costs and performance by pairing the right workload with the right engine. Run all workloads from a single pane of glass, eliminating trade-offs between convenience, cost, and performance. Deploy anywhere with full support for hybrid-cloud and multi-cloud environments. Shared metadata across multiple engines eliminates the need to re-catalog, accelerating time to value while ensuring governance and eliminating costly implementation efforts. This lab uses the watsonx.data developer package. The Developer package is meant to be used on single nodes. While it uses the same code base, there are some restrictions, especially on scale. In this lab, we will open some additional ports as well to understand how everything works. We will also use additional utilities to illustrate connectivity and what makes the watsonx.data system \"open\". We organized this lab into a number of sections that cover many of the highlights and key features of watsonx.data. Access a TechZone or VMWare image for testing Checking watsonx.data status Introduction to watsonx.data components Analytical SQL Advanced SQL functions Time Travel and Federation Working with Object Store Buckets In addition, there is an Appendix which includes common errors and potential fixes or workarounds. Watsonx.data Developer Image The watsonx.data system is running on a virtual machine with the following resources: 4 vCPUs 16GB of memory 400GB of disk This is sufficient for running the exercises found in this lab but should not be used for performance testing or dealing with large data sets. Watsonx.data Level 3 Technical Training This system is used as a basis for the watsonx.data Level 3 Technical Training. For the detailed lab material, please refer to the following PDF found in Seismic: https://ibm.seismic.com/Link/Content/DCG37pjmPj7VmGCHj2Df8fHVmDJj","title":"Introducing watsonx.data"},{"location":"#introducing-watsonxdata","text":"The next-gen watsonx.data lakehouse is designed to overcome the costs and complexities enterprises face. This will be the world\u2019s first and only open data store with multi-engine support that is built for hybrid deployment across your entire ecosystem. Watsonx.data is the only lakehouse with multiple query engines, allowing you to optimize costs and performance by pairing the right workload with the right engine. 
Run all workloads from a single pane of glass, eliminating trade-offs between convenience, cost, and performance. Deploy anywhere with full support for hybrid-cloud and multi-cloud environments. Shared metadata across multiple engines eliminates the need to re-catalog, accelerating time to value while ensuring governance and eliminating costly implementation efforts. This lab uses the watsonx.data developer package. The Developer package is meant to be used on single nodes. While it uses the same code base, there are some restrictions, especially on scale. In this lab, we will open some additional ports as well to understand how everything works. We will also use additional utilities to illustrate connectivity and what makes the watsonx.data system \"open\". We organized this lab into a number of sections that cover many of the highlights and key features of watsonx.data. Access a TechZone or VMWare image for testing Checking watsonx.data status Introduction to watsonx.data components Analytical SQL Advanced SQL functions Time Travel and Federation Working with Object Store Buckets In addition, there is an Appendix which includes common errors and potential fixes or workarounds.","title":"Introducing watsonx.data"},{"location":"#watsonxdata-developer-image","text":"The watsonx.data system is running on a virtual machine with the following resources: 4 vCPUs 16GB of memory 400GB of disk This is sufficient for running the exercises found in this lab but should not be used for performance testing or dealing with large data sets.","title":"Watsonx.data Developer Image"},{"location":"#watsonxdata-level-3-technical-training","text":"This system is used as a basis for the watsonx.data Level 3 Technical Training. For the detailed lab material, please refer to the following PDF found in Seismic: https://ibm.seismic.com/Link/Content/DCG37pjmPj7VmGCHj2Df8fHVmDJj","title":"Watsonx.data Level 3 Technical Training"},{"location":"wxd-acknowledgements/","text":"Acknowledgments We would like to thank the entire development team for helping to deliver this release given the tremendous deadlines and constraints that they have been under. The initial lab was created by Deepak Rangarao with contributions from development. Additional material was supplied by Daniel Hancock, with feedback from the members of the watsonx.data activation community. Formatting and script development was done by George Baklarz. The contents of this eBook are the result of a lot of research and testing based on the contents of watsonx.data. Results are based on a specific version of watsonx.data, so you may have different results if using an older or newer version of the development kit. Support For any questions regarding the lab, including any suggestions, general comments, or bug reports, please contact: George Baklarz baklarz@ca.ibm.com Daniel Hancock daniel.hancock@us.ibm.com We would also appreciate any feedback on the successful use of the lab. Thanks for using watsonx.data! Dan, Deepak & George","title":"Acknowledgements"},{"location":"wxd-acknowledgements/#acknowledgments","text":"We would like to thank the entire development team for helping to deliver this release given the tremendous deadlines and constraints that they have been under. The initial lab was created by Deepak Rangarao with contributions from development. Additional material was supplied by Daniel Hancock, with feedback from the members of the watsonx.data activation community. Formatting and script development was done by George Baklarz. 
The contents of this eBook are the result of a lot of research and testing based on the contents of watsonx.data. Results are based on a specific version of watsonx.data, so you may have different results if using an older or newer version of the development kit.","title":"Acknowledgments"},{"location":"wxd-acknowledgements/#support","text":"For any questions regarding the lab, including any suggestions, general comments, or bug reports, please contact: George Baklarz baklarz@ca.ibm.com Daniel Hancock daniel.hancock@us.ibm.com We would also appreciate any feedback on the successful use of the lab. Thanks for using watsonx.data! Dan, Deepak & George","title":"Support"},{"location":"wxd-advanced/","text":"Advanced Functions Watsonx.data supports several types of functions including: Mathematical functions Conversion functions String functions Regular expression functions Window functions URL functions Geospatial functions For a complete list see - https://prestodb.io/docs/current/functions.html . We will look at using a few simple examples as part of this lab. Switch to the bin directory. cd /root/ibm-lh-dev/bin Connect to the Workshop Schema. ./presto-cli --catalog iceberg_data --schema workshop Concatenation of one or more string/varchar values Note: We are using a combination of the concat string function and the cast conversion function as part of this query. select concat(cast(custkey as varchar),'--',name) from customer limit 2; _col0 ------------------------- 376--Customer#000000376 377--Customer#000000377 (2 rows) Date functions Date functions can be used as part of the projected columns or in the predicate/where clause. Select orders from the last 2 days. select orderdate from orders where orderdate > date '1998-08-02' - interval '2' day; orderdate ------------ 1998-08-02 1998-08-02 1998-08-01 1998-08-01 1998-08-02 1998-08-01 1998-08-01 1998-08-01 1998-08-02 1998-08-02 1998-08-02 1998-08-02 (12 rows) Number of orders by year. select distinct year(orderdate), count(orderkey) from orders group by year(orderdate); _col0 | _col1 -------+------- 1993 | 2307 1994 | 2303 1998 | 1346 1996 | 2297 1995 | 2204 1992 | 2256 1997 | 2287 (7 rows) Geospatial functions There are 3 basic geometries, then some complex geometries. The basic geometries include: Points Lines Polygons Points You could use https://www.latlong.net to get the longitude/latitude given any address. select ST_Point(-121.748360,37.195840) as SVL, ST_Point(-122.378952, 37.621311) as SFO; SVL | SFO -----------------------------+------------------------------- POINT (-121.74836 37.19584) | POINT (-122.378952 37.621311) (1 row) Lines You could use https://www.latlong.net to get the longitude/latitude for 2 points and then create a straight line from it. Below is just a small stretch of the road leading to IBM SVL campus. select ST_LineFromText('LINESTRING (-121.74294303079807 37.19665657093434, -121.73659072815602 37.20102399761407)'); _col0 ------------------------------------------------------------------------------------------- LINESTRING (-121.74294303079807 37.19665657093434, -121.73659072815602 37.20102399761407) (1 row) Polygons You could use https://geojson.io/#map=16.39/37.196336/-121.746303 to click around and generate the coordinates for a polygon of any shape. The following is a polygon of the IBM Silicon Valley campus. 
select ST_Polygon('POLYGON ( (-121.74418635253568 37.196001834113844, -121.74499684288966 37.19668005184322, -121.74584008032835 37.19707784979194, -121.74629035274705 37.197645197338105, -121.74672425162339 37.198186455965086, -121.74705172247337 37.19828427337538, -121.74760023614738 37.19827775221884, -121.74848440744239 37.19836252721197, -121.74932764488139 37.19789300297414, -121.75039192514376 37.19746260319114, -121.75130884352407 37.19721479614175, -121.75195559845278 37.1963670290329, -121.75198015876644 37.19555185937345, -121.7508585711051 37.19458016564036, -121.74940132582242 37.19447582194559, -121.74841891327239 37.1942866986312, -121.7474446874937 37.193556286900346, -121.74418635253568 37.196001834113844))'); Truncated output ------------------------------------------------------------------------------------------------------------------------------------------------------> POLYGON ((-121.74418635253568 37.196001834113844, -121.74499684288966 37.19668005184322, -121.74584008032835 37.19707784979194, -121.74629035274705 3> (1 row) So now that we have 3 basic geometries Point, Line and Polygon we can perform different operations on spatial data including: Distance between 2 points Point in polygon Intersection of line and polygon \u2003 Distance between SFO airport and IBM SVL We can now use geospatial functions in a nested way to find the distance between 2 points. select ST_Distance(to_spherical_geography(ST_Point(-122.378952, 37.621311)), to_spherical_geography(ST_Point(-121.748360,37.195840)))*0.000621371 as distance_in_miles; distance_in_miles -------------------- 45.408431373195654 (1 row) Exit Presto. quit;","title":"Advanced Functions"},{"location":"wxd-advanced/#advanced-functions","text":"Watsonx.data supports several types of functions including: Mathematical functions Conversion functions String functions Regular expression functions Window functions URL functions Geospatial functions For a complete list see - https://prestodb.io/docs/current/functions.html . We will look at using a few simple examples as part of this lab. Switch to the bin directory. cd /root/ibm-lh-dev/bin Connect to the Workshop Schema. ./presto-cli --catalog iceberg_data --schema workshop","title":"Advanced Functions"},{"location":"wxd-advanced/#concatenation-of-one-or-more-stringvarchar-values","text":"Note: We are using a combination of the concat string function and the cast conversion function as part of this query. select concat(cast(custkey as varchar),'--',name) from customer limit 2; _col0 ------------------------- 376--Customer#000000376 377--Customer#000000377 (2 rows)","title":"Concatenation of one or more string/varchar values"},{"location":"wxd-advanced/#date-functions","text":"Date functions can be used as part of the projected columns or in the predicate/where clause. Select orders from the last 2 days. select orderdate from orders where orderdate > date '1998-08-02' - interval '2' day; orderdate ------------ 1998-08-02 1998-08-02 1998-08-01 1998-08-01 1998-08-02 1998-08-01 1998-08-01 1998-08-01 1998-08-02 1998-08-02 1998-08-02 1998-08-02 (12 rows) Number of orders by year. select distinct year(orderdate), count(orderkey) from orders group by year(orderdate); _col0 | _col1 -------+------- 1993 | 2307 1994 | 2303 1998 | 1346 1996 | 2297 1995 | 2204 1992 | 2256 1997 | 2287 (7 rows)","title":"Date functions"},{"location":"wxd-advanced/#geospatial-functions","text":"There are 3 basic geometries, then some complex geometries. 
The basic geometries include: Points Lines Polygons","title":"Geospatial functions"},{"location":"wxd-advanced/#points","text":"You could use https://www.latlong.net to get the longitude/latitude given any address. select ST_Point(-121.748360,37.195840) as SVL, ST_Point(-122.378952, 37.621311) as SFO; SVL | SFO -----------------------------+------------------------------- POINT (-121.74836 37.19584) | POINT (-122.378952 37.621311) (1 row)","title":"Points"},{"location":"wxd-advanced/#lines","text":"You could use https://www.latlong.net to get the longitude/latitude for 2 points and then create a straight line from it. Below is just a small stretch of the road leading to IBM SVL campus. select ST_LineFromText('LINESTRING (-121.74294303079807 37.19665657093434, -121.73659072815602 37.20102399761407)'); _col0 ------------------------------------------------------------------------------------------- LINESTRING (-121.74294303079807 37.19665657093434, -121.73659072815602 37.20102399761407) (1 row)","title":"Lines"},{"location":"wxd-advanced/#polygons","text":"You could use https://geojson.io/#map=16.39/37.196336/-121.746303 to click around and generate the coordinates for a polygon of any shape. The following is a polygon of the IBM Silicon Valley campus. select ST_Polygon('POLYGON ( (-121.74418635253568 37.196001834113844, -121.74499684288966 37.19668005184322, -121.74584008032835 37.19707784979194, -121.74629035274705 37.197645197338105, -121.74672425162339 37.198186455965086, -121.74705172247337 37.19828427337538, -121.74760023614738 37.19827775221884, -121.74848440744239 37.19836252721197, -121.74932764488139 37.19789300297414, -121.75039192514376 37.19746260319114, -121.75130884352407 37.19721479614175, -121.75195559845278 37.1963670290329, -121.75198015876644 37.19555185937345, -121.7508585711051 37.19458016564036, -121.74940132582242 37.19447582194559, -121.74841891327239 37.1942866986312, -121.7474446874937 37.193556286900346, -121.74418635253568 37.196001834113844))'); Truncated output ------------------------------------------------------------------------------------------------------------------------------------------------------> POLYGON ((-121.74418635253568 37.196001834113844, -121.74499684288966 37.19668005184322, -121.74584008032835 37.19707784979194, -121.74629035274705 3> (1 row) So now that we have 3 basic geometries Point, Line and Polygon we can perform different operations on spatial data including: Distance between 2 points Point in polygon Intersection of line and polygon \u2003 Distance between SFO airport and IBM SVL We can now use geospatial functions in a nested way to find the distance between 2 points. select ST_Distance(to_spherical_geography(ST_Point(-122.378952, 37.621311)), to_spherical_geography(ST_Point(-121.748360,37.195840)))*0.000621371 as distance_in_miles; distance_in_miles -------------------- 45.408431373195654 (1 row) Exit Presto. quit;","title":"Polygons"},{"location":"wxd-analytics/","text":"Analytic Workloads Watsonx.data is based on open source PrestoDB, a distributed query engine that enables querying data stored in open file formats using open table formats for optimization and performance. Some of the characteristics which you will learn and see in action include: Compute processing is performed in memory and in parallel. Data is pipelined between query stages and over the network reducing latency overhead that one would have if disk I/O were involved. 
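To see this staged, in-memory execution for yourself, you can peek at the engine's own bookkeeping. The following query is a minimal sketch; it assumes the standard PrestoDB system connector is available in the developer image, which is not something this lab configures explicitly. Run it from the Presto CLI while another query is active to watch queries move through their states. -- List the most recent queries known to the coordinator (hypothetical check) select query_id, state, query from system.runtime.queries order by created desc limit 5; A related table, system.runtime.tasks, breaks each query down into the per-stage tasks that are pipelined across the engine.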
Executing and analyzing analytic workloads Let us start with some simple examples of running queries and analyze the execution. We can either use the dBeaver interface or the watsonx.data CLI. We will eventually be able to use the watsonx.data console UI as well but for the moment it is under construction. Connect to watsonx.data Make sure you are the root user and change to the development directory. cd /root/ibm-lh-dev/bin Open the Presto CLI. Note : The workshop schema was created as part of the introduction to Minio. If you have not run that lab, the schema will not be available. Please see the Introduction to Minio section. ./presto-cli --catalog iceberg_data --schema workshop Run a simple scan query which selects customer names and market segment. select name, mktsegment from customer limit 3; name | mktsegment --------------------+------------ Customer#000000376 | AUTOMOBILE Customer#000000377 | MACHINERY Customer#000000378 | BUILDING (3 rows) To understand the query execution plan we use the explain statement. explain select name, mktsegment from customer; - Output[name, mktsegment] => [name:varchar, mktsegment:varchar] Estimates: {rows: 1500 (15.85kB), cpu: 16230.00, memory: 0.00, network: 16230.00} - RemoteStreamingExchange[GATHER] => [name:varchar, mktsegment:varchar] Estimates: {rows: 1500 (15.85kB), cpu: 16230.00, memory: 0.00, network: 16230.00} - TableScan[TableHandle {connectorId='iceberg_data', connectorHandle='workshop.customer$data@Optional[7053670466726060568]', layout='Optional[workshop.customer$data@Optional[7053670466726060568]]'}] => [name:varchar, mktsegment:varchar] Estimates: {rows: 1500 (15.85kB), cpu: 16230.00, memory: 0.00, network: 0.00} mktsegment := 7:mktsegment:varchar (1:38) name := 2:name:varchar (1:38) What you see above is the hierarchy of logical operations to execute the query. Explain the query and focus on IO operations. explain (type io) select name, mktsegment from customer; { \"inputTableColumnInfos\" : [ { \"table\" : { \"catalog\" : \"iceberg_data\", \"schemaTable\" : { \"schema\" : \"workshop\", \"table\" : \"customer\" } }, \"columnConstraints\" : [ ] } ] } Explain physical execution plan for the query. explain (type distributed) select name, mktsegment from customer; Fragment 0 [SINGLE] Output layout: [name, mktsegment] Output partitioning: SINGLE [] Stage Execution Strategy: UNGROUPED_EXECUTION - Output[name, mktsegment] => [name:varchar, mktsegment:varchar] Estimates: {rows: 1500 (15.85kB), cpu: 16230.00, memory: 0.00, network: 16230.00} - RemoteSource[1] => [name:varchar, mktsegment:varchar] Fragment 1 [SOURCE] Output layout: [name, mktsegment] Output partitioning: SINGLE [] Stage Execution Strategy: UNGROUPED_EXECUTION - TableScan[TableHandle {connectorId='iceberg_data', connectorHandle='workshop.customer$data@Optional[7053670466726060568]', layout='Optional[workshop.customer$data@Optional[7053670466726060568]]'}, grouped = false] => [name:varchar, mktsegment:varchar] Estimates: {rows: 1500 (15.85kB), cpu: 16230.00, memory: 0.00, network: 0.00} mktsegment := 7:mktsegment:varchar (1:57) name := 2:name:varchar (1:57) A fragment represents a stage of the distributed plan. The Presto scheduler schedules the execution by each stage, and stages can be run on separate instances. Create explain statement in a visual format. explain (format graphviz) select name, mktsegment from customer; digraph logical_plan { subgraph cluster_0 { label = \"SINGLE\" plannode_1[label=\"{Output[name, mktsegment]|Estimates: \\{rows: ? 
(?), cpu: ?, memory: ?, network: ?\\} }\", style=\"rounded, filled\", shape=record, fillcolor=white]; plannode_2[label=\"{ExchangeNode[GATHER]|name, mktsegment|Estimates: \\{rows: ? (?), cpu: ?, memory: ?, network: ?\\} }\", style=\"rounded, filled\", shape=record, fillcolor=gold]; plannode_3[label=\"{TableScan | [TableHandle \\{connectorId='iceberg_data', connectorHandle='workshop.customer$data@Optional[7053670466726060568]', layout='Optional[workshop.customer$data@Optional[7053670466726060568]]'\\}]|Estimates: \\{rows: ? (?), cpu: ?, memory: ?, network: ?\\} }\", style=\"rounded, filled\", shape=record, fillcolor=deepskyblue]; } plannode_1 -> plannode_2; plannode_2 -> plannode_3; } We are going to format the output from the explain statement and display it as a graphic. Quit Presto. quit; Place the explain SQL into a file that will be run as a script by Presto. cat <<EOF >/root/ibm-lh-dev/localstorage/volumes/infra/explain.sql explain (format graphviz) select name, mktsegment from customer; EOF Run Presto by pointing to the file with the SQL in it. ./presto-cli --catalog iceberg_data --schema workshop --file /mnt/infra/explain.sql > /tmp/plan.dot We need to get rid of the headers and extra characters that Presto generated when creating the output (there is no way to turn that off). cat /tmp/plan.dot | sed 's/\"\"/\"/g' | sed -z 's/\"//' | sed '$s/\"//' > /tmp/fixedplan.dot Generate the PNG file from the explain statement. dot -Tpng /tmp/fixedplan.dot > /tmp/plan.png Open a separate terminal window and issue the following command (using the SSH port number and server name supplied in your reservation). Mac OSX user scp -P port watsonx@region.techzone-server.com:/tmp/plan.png plan.png && open plan.png Windows user scp -P port watsonx@region.techzone-server.com:/tmp/plan.png plan.png & start \"\" \"plan.png\" Linux user (watsonx.data server) eog /tmp/plan.png Creating a Table with User-defined Partitions Connect to Presto with the Workshop Schema. ./presto-cli --catalog iceberg_data --schema workshop Create a partitioned table, based on column mktsegment and copy data from TPCH.TINY.CUSTOMER table. create table iceberg_data.workshop.part_customer with (partitioning = array['mktsegment']) as select * from tpch.tiny.customer; Quit Presto. quit; Inspect object store directory/object/file structure Open your browser and connect to the MinIO console. If you forget the userid and password, use the following command to extract them or use the passwords command. export LH_S3_ACCESS_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_ACCESS_KEY | sed 's/.*=//') export LH_S3_SECRET_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_SECRET_KEY | sed 's/.*=//') echo \"MinIO Userid : \" $LH_S3_ACCESS_KEY echo \"MinIO Password: \" $LH_S3_SECRET_KEY Click on the Object browser tab to show the current buckets in the MinIO system. Select iceberg-bucket. You will see two tables, customer and part_customer. Select part_customer. Then select data. Examining the part_customer, you will notice that the data is split into multiple parquet files stored across multiple directories - a single directory for each unique value of the partition key. Predicate query to utilize partitions Connect to Presto with the Workshop Schema. ./presto-cli --catalog iceberg_data --schema workshop Now that we have created a partitioned table, we will execute a SQL statement that will make use of this fact. 
select * from iceberg_data.\"workshop\".part_customer where mktsegment='MACHINERY'; custkey | name | address | nationkey | phone | acctbal | mktsegment | comment ---------+--------------------+------------------------------------------+-----------+-----------------+---------+------------+---------------------------------------------------------------------------------------------------------------------- 1131 | Customer#000001131 | KVAvB1lwuN qHWDDPNckenmRGULDFduxYRSBXv | 20 | 30-644-540-9044 | 6019.1 | MACHINERY | er the carefully dogged courts m 1133 | Customer#000001133 | FfA0o cMP02Ylzxtmbq8DCOq | 14 | 24-858-762-2348 | 5335.36 | MACHINERY | g to the pending, ironic pinto beans. furiously blithe packages are fina 1141 | Customer#000001141 | A6uzuXpgRPp19ek8K8zd5O | 22 | 32-330-618-9020 | 0.97 | MACHINERY | accounts. furiously pending deposits cajole. c 1149 | Customer#000001149 | 5JOAwCy8MD70TUZJDyxgEBMe | 3 | 13-254-242-3889 | 6287.79 | MACHINERY | ress requests haggle carefully across the fluffily regula 1150 | Customer#000001150 | fUJqzdkQg1 | 21 | 31-236-665-8430 | -117.31 | MACHINERY | usly final dolphins. fluffily bold platelets sleep. slyly unusual attainments lo 1155 | Customer#000001155 | kEDBn1IQWyHyYjgGGs6FiXfm3 | 8 | 18-864-953-3058 | 3510.25 | MACHINERY | ages? fluffily even accounts shall have to boost furiously alongside of the furiously pendin 1158 | Customer#000001158 | btAl2dQdvNV9cEzTwVRloTb08sLYKDopV2cK,p | 10 | 20-487-747-8857 | 3081.79 | MACHINERY | theodolites use stealthy asymptotes. frets integrate even instructions. car 1161 | Customer#000001161 | QD7s2P6QpCC6g9t2aVzKg7y | 19 | 29-213-663-3342 | 591.31 | MACHINERY | ly alongside of the quickly blithe ideas. quickly ironic accounts haggle regul 1165 | Customer#000001165 | h7KTXGSqsn0 | 9 | 19-766-409-6769 | 8177.33 | MACHINERY | jole slyly beside the quickly final accounts. silent, even requests are stealthily ironic, re 1166 | Customer#000001166 | W4FAGNPKcJFebzldtNp8SehhH3 | 17 | 27-869-223-7506 | 507.26 | MACHINERY | before the platelets! carefully bold ideas lose carefully 1169 | Customer#000001169 | 04YQNIYyRRFxUnJsTP36da | 4 | 14-975-169-9356 | 7503.3 | MACHINERY | into beans doubt about the slyly ironic multipliers. carefully regular requests breach theodolites. special packages 1188 | Customer#000001188 | PtwoF3jNQ9r6 GbPIelt GvbNBuDH | 15 | 25-108-989-8154 | 3698.86 | MACHINERY | ts. quickly unusual ideas affix aft 1190 | Customer#000001190 | JwzW9OtxFRXDnVo5hXl8 2A5VxH12 | 15 | 25-538-604-9042 | 2743.63 | MACHINERY | regular deposits according to the pending packages wake blithely among the silent inst 1203 | Customer#000001203 | 9pTq4gggfKoSqQetn0yJR | 16 | 26-370-660-6154 | 5787.69 | MACHINERY | osits nag furiously final accounts. silent pack ... Many more rows Due to the partitioning of this table by mktsegment , it will completely skip scanning a large percentage of the objects in the object store. We run an explain against this query using the following command. 
explain (format graphviz) select * from iceberg_data.\"workshop\".customer where mktsegment='MACHINERY'; Query Plan ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- digraph logical_plan { subgraph cluster_0 { label = \"SINGLE\" plannode_1[label=\"{Output[custkey, name, address, nationkey, phone, acctbal, mktsegment, comment]|Estimates: \\{rows: 750 (56.84kB), cpu: 232830.00, memory: 0.00, network: 58207.50\\} }\", style=\"rounded, filled\", shape=record, fillcolor=white]; plannode_2[label=\"{ExchangeNode[GATHER]|custkey, name, address, nationkey, phone, acctbal, mktsegment, comment|Estimates: \\{rows: 750 (56.84kB), cpu: 232830.00, memory: 0.00, network: 58207.50\\} }\", style=\"rounded, filled\", shape=record, fillcolor=gold]; plannode_3[label=\"{Filter|(mktsegment) = (VARCHAR'MACHINERY')|Estimates: \\{rows: 750 (56.84kB), cpu: 232830.00, memory: 0.00, network: 0.00\\} }\", style=\"rounded, filled\", shape=record, fillcolor=yellow]; plannode_4[label=\"{TableScan | [TableHandle \\{connectorId='iceberg_data', connectorHandle='workshop.customer$data@Optional[7230522396120575591]', layout='Optional[workshop.customer$data@Optional[7230522396120575591]]'\\}]|Estimates: \\{rows: 1500 (113.69kB), cpu: 116415.00, memory: 0.00, network: 0.00\\} }\", style=\"rounded, filled\", shape=record, fillcolor=deepskyblue]; } plannode_1 -> plannode_2; plannode_2 -> plannode_3; plannode_3 -> plannode_4; } To visualize this, we are going to run this command and place the results into a temporary file. Exit Presto. quit; Place the explain SQL into the following file. cat <<EOF >/root/ibm-lh-dev/localstorage/volumes/infra/explain.sql explain (format graphviz) select * from iceberg_data.\"workshop\".customer where mktsegment='MACHINERY'; EOF Run the Presto command to generate the explain output. ./presto-cli --catalog iceberg_data --schema workshop --file /mnt/infra/explain.sql > /tmp/plan.dot Remove Headers. cat /tmp/plan.dot | sed 's/\"\"/\"/g' | sed -z 's/\"//' | sed '$s/\"//' > /tmp/fixedplan.dot Generate the PNG file from the explain statement. dot -Tpng /tmp/fixedplan.dot > /tmp/plan.png Open a separate terminal window and issue the following command (using the SSH port number and server name supplied in your reservation). Mac OSX user scp -P port watsonx@region.techzone-server.com:/tmp/plan.png plan.png && open plan.png Windows user scp -P port watsonx@region.techzone-server.com:/tmp/plan.png plan.png & start \"\" \"plan.png\" Linux user (watsonx.data server) eog /tmp/plan.png Joins and Aggregations This section will create an orders table to test joins and aggregations. Start Presto CLI with Workshop Schema. ./presto-cli --catalog iceberg_data --schema workshop Create the Orders Table. create table iceberg_data.workshop.orders as select * from tpch.tiny.orders; CREATE TABLE: 15000 rows Use a Windowing function. SELECT orderkey, clerk, totalprice, rank() OVER (PARTITION BY clerk ORDER BY totalprice DESC) AS rnk FROM orders ORDER BY clerk, rnk; Try to write a window function to show the custkey, orderdate, totalprice and priororder. The output should look like this. 
custkey | orderdate | totalprice | priororder ---------+------------+------------+------------ 1 | 1993-06-05 | 152411.41 | NULL 1 | 1993-08-13 | 83095.85 | 152411.41 1 | 1994-05-08 | 51134.82 | 83095.85 1 | 1995-10-29 | 165928.33 | 51134.82 1 | 1997-01-29 | 231040.44 | 165928.33 1 | 1997-03-04 | 270087.44 | 231040.44 1 | 1997-06-23 | 357345.46 | 270087.44 1 | 1997-11-18 | 28599.83 | 357345.46 1 | 1998-03-29 | 89230.03 | 28599.83 2 | 1993-02-19 | 170842.93 | 89230.03 2 | 1993-05-03 | 154867.09 | 170842.93 2 | 1993-09-30 | 143707.7 | 154867.09 2 | 1994-08-15 | 116247.57 | 143707.7 2 | 1994-12-29 | 45657.87 | 116247.57 2 | 1996-03-04 | 181875.6 | 45657.87 Prepared statements Save a query as a prepared statement. prepare customer_by_segment from select * from customer where mktsegment=?; Execute prepared statement using parameters. execute customer_by_segment using 'FURNITURE'; Note : This is only valid for the active session. Quit Presto. quit;","title":"Analytic Workloads"},{"location":"wxd-analytics/#analytic-workloads","text":"Watsonx.data is based on open source PrestoDB, a distributed query engine that enables querying data stored in open file formats using open table formats for optimization and performance. Some of the characteristics which you will learn and see in action include: Compute processing is performed in memory and in parallel. Data is pipelined between query stages and over the network reducing latency overhead that one would have if disk I/O were involved.","title":"Analytic Workloads"},{"location":"wxd-analytics/#executing-and-analyzing-analytic-workloads","text":"Let us start with some simple examples of running queries and analyze the execution. We can either use the dBeaver interface or the watsonx.data CLI. We will eventually be able to use the watsonx.data console UI as well but for the moment it is under construction.","title":"Executing and analyzing analytic workloads"},{"location":"wxd-analytics/#connect-to-watsonxdata","text":"Make sure you are the root user and change to the development directory. cd /root/ibm-lh-dev/bin Open the Presto CLI. Note : The workshop schema was created as part of the introduction to Minio. If you have not run that lab, the schema will not be available. Please see the Introduction to Minio section. ./presto-cli --catalog iceberg_data --schema workshop Run a simple scan query which selects customer names and market segment. select name, mktsegment from customer limit 3; name | mktsegment --------------------+------------ Customer#000000376 | AUTOMOBILE Customer#000000377 | MACHINERY Customer#000000378 | BUILDING (3 rows) To understand the query execution plan we use the explain statement. explain select name, mktsegment from customer; - Output[name, mktsegment] => [name:varchar, mktsegment:varchar] Estimates: {rows: 1500 (15.85kB), cpu: 16230.00, memory: 0.00, network: 16230.00} - RemoteStreamingExchange[GATHER] => [name:varchar, mktsegment:varchar] Estimates: {rows: 1500 (15.85kB), cpu: 16230.00, memory: 0.00, network: 16230.00} - TableScan[TableHandle {connectorId='iceberg_data', connectorHandle='workshop.customer$data@Optional[7053670466726060568]', layout='Optional[workshop.customer$data@Optional[7053670466726060568]]'}] => [name:varchar, mktsegment:varchar] Estimates: {rows: 1500 (15.85kB), cpu: 16230.00, memory: 0.00, network: 0.00} mktsegment := 7:mktsegment:varchar (1:38) name := 2:name:varchar (1:38) What you see above is the hierarchy of logical operations to execute the query. Explain the query and focus on IO operations. 
explain (type io) select name, mktsegment from customer; { \"inputTableColumnInfos\" : [ { \"table\" : { \"catalog\" : \"iceberg_data\", \"schemaTable\" : { \"schema\" : \"workshop\", \"table\" : \"customer\" } }, \"columnConstraints\" : [ ] } ] } Explain physical execution plan for the query. explain (type distributed) select name, mktsegment from customer; Fragment 0 [SINGLE] Output layout: [name, mktsegment] Output partitioning: SINGLE [] Stage Execution Strategy: UNGROUPED_EXECUTION - Output[name, mktsegment] => [name:varchar, mktsegment:varchar] Estimates: {rows: 1500 (15.85kB), cpu: 16230.00, memory: 0.00, network: 16230.00} - RemoteSource[1] => [name:varchar, mktsegment:varchar] Fragment 1 [SOURCE] Output layout: [name, mktsegment] Output partitioning: SINGLE [] Stage Execution Strategy: UNGROUPED_EXECUTION - TableScan[TableHandle {connectorId='iceberg_data', connectorHandle='workshop.customer$data@Optional[7053670466726060568]', layout='Optional[workshop.customer$data@Optional[7053670466726060568]]'}, grouped = false] => [name:varchar, mktsegment:varchar] Estimates: {rows: 1500 (15.85kB), cpu: 16230.00, memory: 0.00, network: 0.00} mktsegment := 7:mktsegment:varchar (1:57) name := 2:name:varchar (1:57) A fragment represents a stage of the distributed plan. The Presto scheduler schedules the execution of each stage, and stages can be run on separate instances. Create explain statement in a visual format. explain (format graphviz) select name, mktsegment from customer; digraph logical_plan { subgraph cluster_0 { label = \"SINGLE\" plannode_1[label=\"{Output[name, mktsegment]|Estimates: \\{rows: ? (?), cpu: ?, memory: ?, network: ?\\} }\", style=\"rounded, filled\", shape=record, fillcolor=white]; plannode_2[label=\"{ExchangeNode[GATHER]|name, mktsegment|Estimates: \\{rows: ? (?), cpu: ?, memory: ?, network: ?\\} }\", style=\"rounded, filled\", shape=record, fillcolor=gold]; plannode_3[label=\"{TableScan | [TableHandle \\{connectorId='iceberg_data', connectorHandle='workshop.customer$data@Optional[7053670466726060568]', layout='Optional[workshop.customer$data@Optional[7053670466726060568]]'\\}]|Estimates: \\{rows: ? (?), cpu: ?, memory: ?, network: ?\\} }\", style=\"rounded, filled\", shape=record, fillcolor=deepskyblue]; } plannode_1 -> plannode_2; plannode_2 -> plannode_3; } We are going to format the output from the explain statement and display it as a graphic. Quit Presto. quit; Place the explain SQL into a file that will be run as a script by Presto. cat <<EOF >/root/ibm-lh-dev/localstorage/volumes/infra/explain.sql explain (format graphviz) select name, mktsegment from customer; EOF Run Presto by pointing to the file with the SQL in it. ./presto-cli --catalog iceberg_data --schema workshop --file /mnt/infra/explain.sql > /tmp/plan.dot We need to get rid of the headers and extra characters that Presto generated when creating the output (there is no way to turn that off). cat /tmp/plan.dot | sed 's/\"\"/\"/g' | sed -z 's/\"//' | sed '$s/\"//' > /tmp/fixedplan.dot Generate the PNG file from the explain statement. 
dot -Tpng /tmp/fixedplan.dot > /tmp/plan.png Open a separate terminal window and issue the following command (using the SSH port number and server name supplied in your reservation).","title":"Connect to watsonx.data"},{"location":"wxd-analytics/#mac-osx-user","text":"scp -P port watsonx@region.techzone-server.com:/tmp/plan.png plan.png && open plan.png","title":"Mac OSX user"},{"location":"wxd-analytics/#windows-user","text":"scp -P port watsonx@region.techzone-server.com:/tmp/plan.png plan.png & start \"\" \"plan.png\"","title":"Windows user"},{"location":"wxd-analytics/#linux-user-watsonxdata-server","text":"eog /tmp/plan.png","title":"Linux user (watsonx.data server)"},{"location":"wxd-analytics/#creating-a-table-with-user-defined-partitions","text":"Connect to Presto with the Workshop Schema. ./presto-cli --catalog iceberg_data --schema workshop Create a partitioned table, based on column mktsegment and copy data from TPCH.TINY.CUSTOMER table. create table iceberg_data.workshop.part_customer with (partitioning = array['mktsegment']) as select * from tpch.tiny.customer; Quit Presto. quit;","title":"Creating a Table with User-defined Partitions"},{"location":"wxd-analytics/#inspect-object-store-directoryobjectfile-structure","text":"Open your browser and connect to the MinIO console. If you forget the userid and password, use the following command to extract them or use the passwords command. export LH_S3_ACCESS_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_ACCESS_KEY | sed 's/.*=//') export LH_S3_SECRET_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_SECRET_KEY | sed 's/.*=//') echo \"MinIO Userid : \" $LH_S3_ACCESS_KEY echo \"MinIO Password: \" $LH_S3_SECRET_KEY Click on the Object browser tab to show the current buckets in the MinIO system. Select iceberg-bucket. You will see two tables, customer and part_customer. Select part_customer. Then select data. Examining the part_customer, you will notice that the data is split into multiple parquet files stored across multiple directories - a single directory for each unique value of the partition key.","title":"Inspect object store directory/object/file structure"},{"location":"wxd-analytics/#predicate-query-to-utilize-partitions","text":"Connect to Presto with the Workshop Schema. ./presto-cli --catalog iceberg_data --schema workshop Now that we have created a partitioned table, we will execute a SQL statement that will make use of this fact. select * from iceberg_data.\"workshop\".part_customer where mktsegment='MACHINERY'; custkey | name | address | nationkey | phone | acctbal | mktsegment | comment ---------+--------------------+------------------------------------------+-----------+-----------------+---------+------------+---------------------------------------------------------------------------------------------------------------------- 1131 | Customer#000001131 | KVAvB1lwuN qHWDDPNckenmRGULDFduxYRSBXv | 20 | 30-644-540-9044 | 6019.1 | MACHINERY | er the carefully dogged courts m 1133 | Customer#000001133 | FfA0o cMP02Ylzxtmbq8DCOq | 14 | 24-858-762-2348 | 5335.36 | MACHINERY | g to the pending, ironic pinto beans. furiously blithe packages are fina 1141 | Customer#000001141 | A6uzuXpgRPp19ek8K8zd5O | 22 | 32-330-618-9020 | 0.97 | MACHINERY | accounts. furiously pending deposits cajole. 
c 1149 | Customer#000001149 | 5JOAwCy8MD70TUZJDyxgEBMe | 3 | 13-254-242-3889 | 6287.79 | MACHINERY | ress requests haggle carefully across the fluffily regula 1150 | Customer#000001150 | fUJqzdkQg1 | 21 | 31-236-665-8430 | -117.31 | MACHINERY | usly final dolphins. fluffily bold platelets sleep. slyly unusual attainments lo 1155 | Customer#000001155 | kEDBn1IQWyHyYjgGGs6FiXfm3 | 8 | 18-864-953-3058 | 3510.25 | MACHINERY | ages? fluffily even accounts shall have to boost furiously alongside of the furiously pendin 1158 | Customer#000001158 | btAl2dQdvNV9cEzTwVRloTb08sLYKDopV2cK,p | 10 | 20-487-747-8857 | 3081.79 | MACHINERY | theodolites use stealthy asymptotes. frets integrate even instructions. car 1161 | Customer#000001161 | QD7s2P6QpCC6g9t2aVzKg7y | 19 | 29-213-663-3342 | 591.31 | MACHINERY | ly alongside of the quickly blithe ideas. quickly ironic accounts haggle regul 1165 | Customer#000001165 | h7KTXGSqsn0 | 9 | 19-766-409-6769 | 8177.33 | MACHINERY | jole slyly beside the quickly final accounts. silent, even requests are stealthily ironic, re 1166 | Customer#000001166 | W4FAGNPKcJFebzldtNp8SehhH3 | 17 | 27-869-223-7506 | 507.26 | MACHINERY | before the platelets! carefully bold ideas lose carefully 1169 | Customer#000001169 | 04YQNIYyRRFxUnJsTP36da | 4 | 14-975-169-9356 | 7503.3 | MACHINERY | into beans doubt about the slyly ironic multipliers. carefully regular requests breach theodolites. special packages 1188 | Customer#000001188 | PtwoF3jNQ9r6 GbPIelt GvbNBuDH | 15 | 25-108-989-8154 | 3698.86 | MACHINERY | ts. quickly unusual ideas affix aft 1190 | Customer#000001190 | JwzW9OtxFRXDnVo5hXl8 2A5VxH12 | 15 | 25-538-604-9042 | 2743.63 | MACHINERY | regular deposits according to the pending packages wake blithely among the silent inst 1203 | Customer#000001203 | 9pTq4gggfKoSqQetn0yJR | 16 | 26-370-660-6154 | 5787.69 | MACHINERY | osits nag furiously final accounts. silent pack ... Many more rows Due to the partitioning of this table by mktsegment , it will completely skip scanning a large percentage of the objects in the object store. We run an explain against this query using the following command. 
explain (format graphviz) select * from iceberg_data.\"workshop\".customer where mktsegment='MACHINERY'; Query Plan ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- digraph logical_plan { subgraph cluster_0 { label = \"SINGLE\" plannode_1[label=\"{Output[custkey, name, address, nationkey, phone, acctbal, mktsegment, comment]|Estimates: \\{rows: 750 (56.84kB), cpu: 232830.00, memory: 0.00, network: 58207.50\\} }\", style=\"rounded, filled\", shape=record, fillcolor=white]; plannode_2[label=\"{ExchangeNode[GATHER]|custkey, name, address, nationkey, phone, acctbal, mktsegment, comment|Estimates: \\{rows: 750 (56.84kB), cpu: 232830.00, memory: 0.00, network: 58207.50\\} }\", style=\"rounded, filled\", shape=record, fillcolor=gold]; plannode_3[label=\"{Filter|(mktsegment) = (VARCHAR'MACHINERY')|Estimates: \\{rows: 750 (56.84kB), cpu: 232830.00, memory: 0.00, network: 0.00\\} }\", style=\"rounded, filled\", shape=record, fillcolor=yellow]; plannode_4[label=\"{TableScan | [TableHandle \\{connectorId='iceberg_data', connectorHandle='workshop.customer$data@Optional[7230522396120575591]', layout='Optional[workshop.customer$data@Optional[7230522396120575591]]'\\}]|Estimates: \\{rows: 1500 (113.69kB), cpu: 116415.00, memory: 0.00, network: 0.00\\} }\", style=\"rounded, filled\", shape=record, fillcolor=deepskyblue]; } plannode_1 -> plannode_2; plannode_2 -> plannode_3; plannode_3 -> plannode_4; } To visualize this, we are going to run this command and place the results into a temporary file. Exit Presto. quit; Place the explain SQL into the following file. cat <<EOF >/root/ibm-lh-dev/localstorage/volumes/infra/explain.sql explain (format graphviz) select * from iceberg_data.\"workshop\".customer where mktsegment='MACHINERY'; EOF Run the Presto command to generate the explain output. ./presto-cli --catalog iceberg_data --schema workshop --file /mnt/infra/explain.sql > /tmp/plan.dot Remove Headers. cat /tmp/plan.dot | sed 's/\"\"/\"/g' | sed -z 's/\"//' | sed '$s/\"//' > /tmp/fixedplan.dot Generate the PNG file from the explain statement. dot -Tpng /tmp/fixedplan.dot > /tmp/plan.png Open a separate terminal window and issue the following command (using the SSH port number and server name supplied in your reservation).","title":"Predicate query to utilize partitions"},{"location":"wxd-analytics/#mac-osx-user_1","text":"scp -P port watsonx@region.techzone-server.com:/tmp/plan.png plan.png && open plan.png","title":"Mac OSX user"},{"location":"wxd-analytics/#windows-user_1","text":"scp -P port watsonx@region.techzone-server.com:/tmp/plan.png plan.png & start \"\" \"plan.png\"","title":"Windows user"},{"location":"wxd-analytics/#linux-user-watsonxdata-server_1","text":"eog /tmp/plan.png","title":"Linux user (watsonx.data server)"},{"location":"wxd-analytics/#joins-and-aggregations","text":"This section will create an orders table to test joins and aggregations. Start Presto CLI with Workshop Schema. ./presto-cli --catalog iceberg_data --schema workshop Create the Orders Table. create table iceberg_data.workshop.orders as select * from tpch.tiny.orders; CREATE TABLE: 15000 rows Use a Windowing function. 
SELECT orderkey, clerk, totalprice, rank() OVER (PARTITION BY clerk ORDER BY totalprice DESC) AS rnk FROM orders ORDER BY clerk, rnk; Try to write a window function to show the custkey, orderdate, totalprice and priororder. The output should look like this. custkey | orderdate | totalprice | priororder ---------+------------+------------+------------ 1 | 1993-06-05 | 152411.41 | NULL 1 | 1993-08-13 | 83095.85 | 152411.41 1 | 1994-05-08 | 51134.82 | 83095.85 1 | 1995-10-29 | 165928.33 | 51134.82 1 | 1997-01-29 | 231040.44 | 165928.33 1 | 1997-03-04 | 270087.44 | 231040.44 1 | 1997-06-23 | 357345.46 | 270087.44 1 | 1997-11-18 | 28599.83 | 357345.46 1 | 1998-03-29 | 89230.03 | 28599.83 2 | 1993-02-19 | 170842.93 | 89230.03 2 | 1993-05-03 | 154867.09 | 170842.93 2 | 1993-09-30 | 143707.7 | 154867.09 2 | 1994-08-15 | 116247.57 | 143707.7 2 | 1994-12-29 | 45657.87 | 116247.57 2 | 1996-03-04 | 181875.6 | 45657.87","title":"Joins and Aggregations"},{"location":"wxd-analytics/#prepared-statements","text":"Save a query as a prepared statement. prepare customer_by_segment from select * from customer where mktsegment=?; Execute prepared statement using parameters. execute customer_by_segment using 'FURNITURE'; Note : This is only valid for the active session. Quit Presto. quit;","title":"Prepared statements"},{"location":"wxd-certificate/","text":"Watsonx.data Certificates Watsonx.data Certificate Failure Due to a change in TechZone URLs, the self-signed certificates in the watsonx.data Developer image may be invalid. If you are attempting to connect to the watsonx.data system from outside the virtual machine, you will need to run the following commands to fix the self-signed certificate. Step 1: Connect to the Server Use the SSH port to connect into the server and make sure that you become the root user. sudo su - Step 2: Update the Certificate We need to update the certificate by using a utility in the developer toolbox. Start the toolbox code by switching to the bin directory and issuing the following command. cd /root/ibm-lh-dev/bin ./dev-sandbox Once inside the development container, you will need to update the program that generates the certificates. Note : The certificate should cover all TechZone locations. If for some reason your TechZone server does not match the pattern *.services.cloud.techzone.ibm.com , update it in the command below. sed -i '/DNS.14.*/a DNS.15 = watsonxdata' /scripts/gen_certs.sh sed -i '/DNS.15.*/a DNS.16 = watsonxdata.gym.lan' /scripts/gen_certs.sh sed -i '/DNS.16.*/a DNS.17 = *.services.cloud.techzone.ibm.com' /scripts/gen_certs.sh ./scripts/gen_certs.sh Once the script completes, exit the toolkit. exit Step 3: Stop and Restart the System The certificates need to be replaced in all the running containers. You must stop and restart them. You must include the diagnostic flag or else the system will not work properly. The startup will take some time to complete. The Postgres pod will display some warning messages which can be safely ignored. ./stop.sh export LH_RUN_MODE=diag ./start.sh Step 4: Generate Custom Certificate The first step is to copy the new certificates to the central /certs directory used by this image. docker cp ibm-lh-presto:/mnt/infra/tls/lh-ssl-ts.jks /certs/lh-ssl-ts.jks docker cp ibm-lh-presto:/mnt/infra/tls/cert.crt /certs/lh-ssl-ts.crt Next we need to generate the certificate file that is used by a number of the examples in the lab instructions. 
rm -f presto.crt echo QUIT | openssl s_client -showcerts -connect 127.0.0.1:8443 | awk '/-----BEGIN CERTIFICATE-----/ {p=1}; p; /-----END CERTIFICATE-----/ {p=0}' > presto.crt You can print the certificate if you need it for connections from CP4D. cat presto.crt Step 5: Generate Java Keystore File The next step will create the Java Keystore file. When prompted, use a password of watsonx.data and say yes to accepting the certificate. Make sure that you see your host in the list. For instance, useast.services.cloud.techzone.ibm.com should be displayed when you see the results. rm -f presto-key.jks keytool -import -alias presto-crt -file ./presto.crt -keystore ./presto-key.jks The following is an example of the output from the keytool command. Owner: CN=Dummy-Self-signed-Cert, EMAILADDRESS=dummy@example.dum, OU=For-CPD, O=Data and AI, L=Home-Town, ST=XX, C=YY Issuer: CN=Dummy-Self-signed-Cert, EMAILADDRESS=dummy@example.dum, OU=For-CPD, O=Data and AI, L=Home-Town, ST=XX, C=YY Serial number: 73f26644ad83ac8cdf9afbda6006d4e52f244fac Valid from: Tue Mar 05 17:42:56 EST 2024 until: Wed May 23 18:42:56 EDT 2035 Certificate fingerprints: SHA1: 3A:6C:52:80:3D:14:CF:D0:E7:AC:14:13:6F:46:FB:B1:8C:BA:E4:37 SHA256: 28:E7:AD:4E:BA:5F:00:4C:B7:2E:61:3E:3B:96:E5:DF:01:D5:80:CE:1A:B3:EF:B7:86:11:26:4A:B6:7C:90:8A Signature algorithm name: SHA512withRSA Subject Public Key Algorithm: 2048-bit RSA key Version: 3 Extensions: #1: ObjectId: 2.5.29.37 Criticality=false ExtendedKeyUsages [ serverAuth ] #2: ObjectId: 2.5.29.17 Criticality=false SubjectAlternativeName [ DNSName: ibm-lh-presto-svc DNSName: *.svc.cluster.local DNSName: api-svc DNSName: *.api DNSName: localhost DNSName: ibm-lh-hive-metastore DNSName: ibm-lh-hive-metastore-svc DNSName: lhconsole-api-svc DNSName: lhconsole-nodeclient-svc DNSName: ibm-lh-ranger-svc DNSName: ibm-lh-javaapi-svc DNSName: ibm-lh-prestissimo-svc DNSName: ibm-lh-qhmm DNSName: ibm-lh-qhmm-svc DNSName: *.services.cloud.techzone.ibm.com DNSName: watsonxdata ] Trust this certificate? [no]: yes Certificate was added to keystore Step 6: Create Certificate and Keystore Copies The final step is to copy the certs and keystore values in a central location so they can be used in various scripts and notebooks. \\cp -f presto-key.jks /certs \\cp -f presto.crt /certs chmod +r /certs/*.* \\cp -rf /certs /notebooks/","title":"Watsonx.data Certificates"},{"location":"wxd-certificate/#watsonxdata-certificates","text":"Watsonx.data Certificate Failure Due to a change in TechZone URLs, the self-signed certificates in the watsonx.data Developer image may be invalid. If you are attempting to connect to the watsonx.data system from outside the virtual machine, you will need to run the following commands to fix the self-signed certificate.","title":"Watsonx.data Certificates"},{"location":"wxd-certificate/#step-1-connect-to-the-server","text":"Use the SSH port to connect into the server and make sure that you become the root user. sudo su -","title":"Step 1: Connect to the Server"},{"location":"wxd-certificate/#step-2-update-the-certificate","text":"We need to update the certificate by using a utility in the developer toolbox. Start the toolbox code by switching to the bin directory and issuing the following command. cd /root/ibm-lh-dev/bin ./dev-sandbox Once inside the development container, you will need to update the program that generates the certificates. Note : The certificate should cover all TechZone locations. 
If for some reason your TechZone server does not match the pattern *.services.cloud.techzone.ibm.com , update it in the command below. sed -i '/DNS.14.*/a DNS.15 = watsonxdata' /scripts/gen_certs.sh sed -i '/DNS.15.*/a DNS.16 = watsonxdata.gym.lan' /scripts/gen_certs.sh sed -i '/DNS.16.*/a DNS.17 = *.services.cloud.techzone.ibm.com' /scripts/gen_certs.sh ./scripts/gen_certs.sh Once the script completes, exit the toolkit. exit","title":"Step 2: Update the Certificate"},{"location":"wxd-certificate/#step-3-stop-and-restart-the-system","text":"The certificates need to be replaced in all the running containers. You must stop and restart them. You must include the diagnostic flag or else the system will not work properly. The startup will take some time to complete. The Postgres pod will display some warning messages which can be safely ignored. ./stop.sh export LH_RUN_MODE=diag ./start.sh","title":"Step 3: Stop and Restart the System"},{"location":"wxd-certificate/#step-4-generate-custom-certificate","text":"The first step is to copy the new certificates to the central /certs directory used by this image. docker cp ibm-lh-presto:/mnt/infra/tls/lh-ssl-ts.jks /certs/lh-ssl-ts.jks docker cp ibm-lh-presto:/mnt/infra/tls/cert.crt /certs/lh-ssl-ts.crt Next we need to generate the certificate file that is used by a number of the examples in the lab instructions. rm -f presto.crt echo QUIT | openssl s_client -showcerts -connect 127.0.0.1:8443 | awk '/-----BEGIN CERTIFICATE-----/ {p=1}; p; /-----END CERTIFICATE-----/ {p=0}' > presto.crt You can print the certificate if you need it for connections from CP4D. cat presto.crt","title":"Step 4: Generate Custom Certificate"},{"location":"wxd-certificate/#step-5-generate-java-keystore-file","text":"The next step will create the Java Keystore file. When prompted, use a password of watsonx.data and say yes to accepting the certificate. Make sure that you see your host in the list. For instance, useast.services.cloud.techzone.ibm.com should be displayed when you see the results. rm -f presto-key.jks keytool -import -alias presto-crt -file ./presto.crt -keystore ./presto-key.jks The following is an example of the output from the keytool command. Owner: CN=Dummy-Self-signed-Cert, EMAILADDRESS=dummy@example.dum, OU=For-CPD, O=Data and AI, L=Home-Town, ST=XX, C=YY Issuer: CN=Dummy-Self-signed-Cert, EMAILADDRESS=dummy@example.dum, OU=For-CPD, O=Data and AI, L=Home-Town, ST=XX, C=YY Serial number: 73f26644ad83ac8cdf9afbda6006d4e52f244fac Valid from: Tue Mar 05 17:42:56 EST 2024 until: Wed May 23 18:42:56 EDT 2035 Certificate fingerprints: SHA1: 3A:6C:52:80:3D:14:CF:D0:E7:AC:14:13:6F:46:FB:B1:8C:BA:E4:37 SHA256: 28:E7:AD:4E:BA:5F:00:4C:B7:2E:61:3E:3B:96:E5:DF:01:D5:80:CE:1A:B3:EF:B7:86:11:26:4A:B6:7C:90:8A Signature algorithm name: SHA512withRSA Subject Public Key Algorithm: 2048-bit RSA key Version: 3 Extensions: #1: ObjectId: 2.5.29.37 Criticality=false ExtendedKeyUsages [ serverAuth ] #2: ObjectId: 2.5.29.17 Criticality=false SubjectAlternativeName [ DNSName: ibm-lh-presto-svc DNSName: *.svc.cluster.local DNSName: api-svc DNSName: *.api DNSName: localhost DNSName: ibm-lh-hive-metastore DNSName: ibm-lh-hive-metastore-svc DNSName: lhconsole-api-svc DNSName: lhconsole-nodeclient-svc DNSName: ibm-lh-ranger-svc DNSName: ibm-lh-javaapi-svc DNSName: ibm-lh-prestissimo-svc DNSName: ibm-lh-qhmm DNSName: ibm-lh-qhmm-svc DNSName: *.services.cloud.techzone.ibm.com DNSName: watsonxdata ] Trust this certificate? 
[no]: yes Certificate was added to keystore","title":"Step 5: Generate Java Keystore File"},{"location":"wxd-certificate/#step-6-create-certificate-and-keystore-copies","text":"The final step is to copy the certs and keystore values in a central location so they can be used in various scripts and notebooks. \\cp -f presto-key.jks /certs \\cp -f presto.crt /certs chmod +r /certs/*.* \\cp -rf /certs /notebooks/","title":"Step 6: Create Certificate and Keystore Copies"},{"location":"wxd-connections/","text":"Database Connections There are four database systems that can be accessed inside and outside the virtual machine environment: watsonx.data Presto, Db2 LUW, MySQL and PostgreSQL. In order to access these images outside the Virtual machine image, you need the server name and port for the service. You will also need to download the presto-key.jks file for connecting to Presto. Connection Certificate Accessing watsonx.data (Presto) Accessing Db2 Accessing PostgreSQL Accessing MySQL Adding a database to watsonx.data Accessing watsonx.data via Python Accessing watsonx.data via Pandas Dataframes Generating a Certificate Adding a Service Watsonx.data Connection Certificate When connecting to the watsonx.data Presto database, you will need to have the connection certificate available to the client that you are using. Usually this location is your workstation, but it could be another service like CP4D. To extract the certificate to your local file system, use the following command in a terminal window. Replace the port and region.techzone-server.com with the SSH values found in the TechZone reservation. scp -P port watsonx@region.techzone-server.com:/certs/presto-key.jks /Users/myname/Downloads Change the target directory to a location that you can remember! You can also download the certificate by using the Jupyter Notebook link and opening the Credentials notebook. There you will find links to the certificates. Watsonx.data Presto Access When connecting to the Presto engine, choose the PrestoDB driver. Presto Internal Access For local access the following credentials are used: Hostname: localhost Port: 8443 Username: ibmlhadmin Password: password Database: tpch In addition, you need to set the following driver properties: SSL True SSLTrustStorePath /certs/presto-key.jks SSLTrustStorePassword watsonx.data Presto External Access The watsonx.data Presto database requires that the certificate be extracted from the image. See the section above on Connection Certificate for more details. In the following settings, remember to update the Hostname and Port to the values provided in your TechZone reservation. The database connection settings are: Hostname: region.techzone-server.com Port: port Username: ibmlhadmin Password: password Database: tpch In addition, you need to set the following driver properties: SSL True SSLTrustStorePath /mydownload/presto-key.jks SSLTrustStorePassword watsonx.data Note : The /mydownload/presto-key.jks value needs to be replaced with the location that you copied the key in the earlier step. Db2 Access When connecting to the Db2 engine, select the Db2 LUW driver. 
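The dBeaver connection dialog builds the JDBC URL from the individual fields listed in the sections below, so you normally never type it by hand. If you are writing your own JDBC client code instead, a minimal sketch of the standard Db2 LUW driver URL form, using the internal credentials that follow, would be: jdbc:db2://watsonxdata:50000/gosales For external access, substitute the host and port from your TechZone reservation in the same URL form.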
Db2 Internal Access The Db2 server can be accessed on port 50000 inside the virtual machine using the following credentials: Hostname - watsonxdata Port - 50000 Username - db2inst1 Password - db2inst1 Database - gosales SSL - off Db2 External Access When accessing the database outside the virtual machine, you must change the host to region.techzone-server.com and the port number based on your TechZone reservation. All the other settings remain the same. Hostname - region.techzone-server.com Port - port Username - db2inst1 Password - db2inst1 Database - gosales SSL - off PostgreSQL Access When connecting to the PostgreSQL engine, select the PostgreSQL driver. In order to connect to the PostgreSQL system, you will need to extract the admin password using the following command when connected to the watsonx.data system. cat /certs/passwords Alternatively, the following commands extract the password and save a copy of it in /tmp/postgres.pw. export POSTGRES_PASSWORD=$(docker exec ibm-lh-postgres printenv | grep POSTGRES_PASSWORD | sed 's/.*=//') echo \"Postgres Userid : admin\" echo \"Postgres Password : \" $POSTGRES_PASSWORD echo $POSTGRES_PASSWORD > /tmp/postgres.pw You can also retrieve the credentials by opening up the Credentials notebook in the Jupyter notebook service. PostgreSQL Internal Access When accessing the PostgreSQL database in the system, use the following settings. Hostname \u2013 ibm-lh-postgres Port \u2013 5432 Username \u2013 admin Password \u2013 The value that was extracted in the earlier step Database \u2013 gosales PostgreSQL External Access The following credentials are used for remote access. Hostname: region.techzone-server.com Port: port Username: admin Password: The value that was extracted in the earlier step Database name: gosales MySQL Access When connecting to the MySQL engine, select the MySQL driver. MySQL Internal Access When accessing the MySQL database in the system, use the following settings. Hostname \u2013 watsonxdata Port \u2013 3306 Username \u2013 root Password \u2013 password Database \u2013 gosalesdw You must set the allowPublicKeyRetrieval to True for the connection to work with dBeaver. MySQL External Access The following credentials are used for remote access. Hostname: region.techzone-server.com Port: port Username: root Password: password Database name: gosalesdw You must set the allowPublicKeyRetrieval to True for the connection to work with dBeaver (see above). Adding a Database to watsonx.data When adding a database engine to the watsonx.data system, make sure to change the database display name since that needs to be unique. For instance, when you add the gosales database from Db2 to the system, the display name could be gosales as well. However, if you now add the PostgreSQL database to the system, the display name cannot be the same. You may want to differentiate databases with the same name by prefixing them with the database type. For instance, the gosales database could be shown as db2_gosales or pg_gosales so that you keep the names distinct. Once a database has been added, make sure to wait for a few moments before attempting to access the database. The Presto server takes a few moments to start up. To make sure that it is running, run the check_presto command in a terminal window and wait until it says the service is ready. When attempting to view the contents of a new database, the process may take a few minutes to complete. Refresh the browser window if you haven't seen any changes to the display.
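Before running queries from code against a newly added database, it can be useful to confirm programmatically that the Presto engine has finished starting. The following is a minimal sketch, not part of the lab scripts, that polls Presto with a trivial query until it responds. It assumes the internal ibmlhadmin credentials and certificate location documented in this lab, plus the presto-python-client package that is introduced in the next section.
import time
import prestodb
# Poll Presto with a trivial query until it responds (up to roughly 5 minutes).
ready = False
for attempt in range(30):
    try:
        conn = prestodb.dbapi.connect(
            host='watsonxdata', port=8443, user='ibmlhadmin',
            catalog='tpch', schema='tiny', http_scheme='https',
            auth=prestodb.auth.BasicAuthentication(\"ibmlhadmin\", \"password\")
        )
        conn._http_session.verify = '/certs/lh-ssl-ts.crt'
        cur = conn.cursor()
        cur.execute(\"SELECT 1\")
        cur.fetchall()
        conn.close()
        ready = True
        break
    except Exception:
        time.sleep(10)   # engine not ready yet; wait and retry
print(\"Presto ready:\", ready)
This is the programmatic equivalent of waiting for the check_presto command to report that the service is ready.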
Accessing watsonx.data via Python In order to access the watsonx.data database (Presto), you will need to install the Presto client using the following command on your local machine. pip3 install presto-python-client Once the installation is complete, extract the certificate from the watsonx.data server that we will use in the connection. scp -P port watsonx@region.techzone-server.com:/certs/presto-key.jks /Users/myname/Downloads Change the target directory to a location that you can remember! You can also download the certificate by using the Jupyter Notebook link and opening the Certificate notebook. Python and Jupyter Connection Code Your Python or Jupyter notebook code will need to import the prestodb library and then connect to watsonx.data using the connect call. import prestodb conn = prestodb.dbapi.connect( host='watsonxdata', port=8443, user='ibmlhadmin', catalog='tpch', schema='tiny', http_scheme='https', auth=prestodb.auth.BasicAuthentication(\"ibmlhadmin\", \"password\") ) conn._http_session.verify = '/certs/lh-ssl-ts.crt' cur = conn.cursor() In the above connection string, you will need to replace the following values: host - watsonxdata when connecting from the virtual machine, region.techzone-server.com (with the TechZone port) when connecting from outside the virtual machine, and ibm-lh-presto-svc when connecting from inside the container network catalog - The name of the catalog that is being accessed schema - The schema inside the catalog that will be used You also need to update the conn._http_session.verify value with the location where you downloaded the lh-ssl-ts.crt file. For internal connections, this value will be /certs/lh-ssl-ts.crt . Once connected, you can run an SQL statement and return the results. cur.execute(\"SELECT * FROM tpch.tiny.customer\") rows = cur.fetchall() The rows variable contains the answer set from the select statement. You can manipulate the rows variable to view the results. rows[0] [1, 'Customer#000000001', 'IVhzIApeRb ot,c,E', 15, '25-989-741-2988', 711.56, 'BUILDING', 'to the even, regular platelets. regular, ironic epitaphs nag e'] The PrestoDB driver supports the DBAPI spec. For more details on the use of the DBAPI interface, please refer to https://peps.python.org/pep-0249/ . For instance, if you want to find the description of the columns returned, you would use the description attribute. cur.description [('custkey', 'bigint', None, None, None, None, None), ('name', 'varchar(25)', None, None, None, None, None), ('address', 'varchar(40)', None, None, None, None, None), ('nationkey', 'bigint', None, None, None, None, None), ('phone', 'varchar(15)', None, None, None, None, None), ('acctbal', 'double', None, None, None, None, None), ('mktsegment', 'varchar(10)', None, None, None, None, None), ('comment', 'varchar(117)', None, None, None, None, None)] Accessing watsonx.data via Pandas Dataframes The following code is required for accessing watsonx.data in Jupyter notebooks. Run the following code inside a notebook code cell. %pip install ipython-sql==0.4.1 %pip install sqlalchemy==1.4.46 %pip install sqlalchemy==1.4.46 \"pyhive[presto]\" The notebook may need a restart of the kernel to pick up the changes to the driver. If you are running in a Jupyter Lab environment, you can use the most current versions of the drivers. %pip install ipython-sql %pip install sqlalchemy %pip install sqlalchemy \"pyhive[presto]\" Once the drivers have been loaded, you will need to extract the certificate from the watsonx.data server that we will use in the connection.
scp -P port watsonx@region.techzone-server.com:/certs/presto-key.jks /Users/myname/Downloads Change the target directory to a location that you can remember! You can also download the certificate by using the Jupyter Notebook link and opening the Certificate notebook. In your Jupyter notebook, you will need to import a number of libraries. import pandas as pd import sqlalchemy from sqlalchemy import create_engine Create a notebook cell which will contain all the credentials that are required to connect. Change the catalog , schema and certfile to your values. userid = \"ibmlhadmin\" password = \"password\" hostname = \"watsonxdata\" port = \"8443\" catalog = \"tpch\" schema = \"tiny\" certfile = \"/certs/lh-ssl-ts.crt\" connect_args={ 'protocol': 'https', 'requests_kwargs': {'verify': f'{certfile}'} } In the above settings, you will need to replace the following values: hostname - watsonxdata when connecting from the virtual machine, region.techzone-server.com (with the TechZone port) when connecting from outside the virtual machine, and ibm-lh-presto-svc when connecting from inside the container network catalog - The name of the catalog that is being accessed schema - The schema inside the catalog that will be used You also need to update the certfile value with the location where you downloaded the lh-ssl-ts.crt file. For internal connections, this value will be /certs/lh-ssl-ts.crt . To create a connection to the database, use the following syntax. engine = create_engine( f\"presto://{userid}:{password}@{hostname}:{port}/{catalog}/{schema}\", connect_args=connect_args ) Now that you have established a connection, you can use the Pandas read_sql_query function to execute a SELECT statement against the database. mypresto = pd.read_sql_query('SELECT * from tpch.tiny.customer',engine) The variable mypresto contains the dataframe generated from the SELECT statement. mypresto You can use the features of Pandas to generate plots of the data in your notebook. First make sure you have matplotlib installed. %pip install matplotlib The following query will compute the total account balance across all nation key values. Note that the aggregate column is given an alias ( totalbalance ) so that it can be referenced when plotting. sumbynation = pd.read_sql_query('SELECT \"nationkey\", sum(\"acctbal\") AS \"totalbalance\" from tpch.tiny.customer group by \"nationkey\" order by 2',engine) Finally, we plot the results. import matplotlib.pyplot as plt sumbynation.plot(kind=\"bar\", x=\"nationkey\", y=\"totalbalance\") plt.show() Adding a Service The watsonx.data developer edition includes two open ports which can be used to externalize a service that you create in the image. For instance, you may choose to create a MongoDB or MSSQL container using Docker and want to access this service from your own dBeaver or Mongo tooling. Since port numbers vary between different databases, the watsonx.data system provides two port numbers that can be used by your service. Open Port 1 - Server: region.techzone-services.com Port: 12345 Open Port 2 - Server: region.techzone-services.com Port: 23456 The internal port numbers are 10000 (Port 1) and 10001 (Port 2). The following steps are required to use these ports with your service. Open the local Firewall (Version 1.1.0 Only) Ports 10000/1 are not open by default in the 1.1.0 image. For release 1.1.1, you can skip this step. You must explicitly open ports 10000/1 with the firewall-cmd command. In a command line shell, as the root user, enter the following commands: sudo su - firewall-cmd --add-port={10000/tcp,10001/tcp} --zone=public --permanent firewall-cmd --reload You can use the following command to check that the ports are now open.
firewall-cmd --list-ports Create your Service When creating your service, make sure to map the internal Docker port to either port 10000 or 10001. If you cannot remap the port, see the section on port redirection. For instance, the following command will start Microsoft SQLServer in Docker by mapping the host port 10000 to the SQLServer port 1433 ( -p 10000:1433 ). docker run -e \"ACCEPT_EULA=Y\" -e \"MSSQL_SA_PASSWORD=Passw0rd12345678!\" \\ -p 10000:1433 --name mssql-server --hostname mssql-server \\ -d mcr.microsoft.com/mssql/server:2019-latest You can check the port mapping with the following command. docker container ls --format \"table {{.ID}}\\t{{.Names}}\\t{{.Ports}}\" -a | grep mssql-server When creating a connection to this database using an external tool, make sure to use the port number supplied in the reservation details (Open Port 1 is for port 10000 and Open Port 2 is for port 10001). Port Redirection If you already have an existing service mapped to a different port, you can use port redirection to use either port 10000 or 10001. For instance, assume that the previous creation of the SQLServer database used port 1433. docker run -e \"ACCEPT_EULA=Y\" -e \"MSSQL_SA_PASSWORD=Passw0rd12345678!\" \\ -p 1433:1433 --name mssql-server --hostname mssql-server \\ -d mcr.microsoft.com/mssql/server:2019-latest Once the service is up and running, you can redirect the traffic on port 10000/1 to the port of your docker service. firewall-cmd --add-forward-port=port=10000:proto=tcp:toport=1433 --permanent --zone=public firewall-cmd --reload If you need to remove the redirection, use the following command. firewall-cmd --remove-forward-port=port=10000:proto=tcp:toport=1433:toaddr= --permanent --zone=public firewall-cmd --reload Accessing your Service When referring to your service from an external location, always use the port numbers that are provided for Open Port 1 or 2. Open Port 1 - Server: region.techzone-services.com Port: 12345 Open Port 2 - Server: region.techzone-services.com Port: 23456 Your server will be region.techzone-services.com and the port number will be either of the two port numbers provided. Remember that this port number will need to be opened in the server and a Docker mapping to the open port or a firewall port redirection will be required.","title":"Database Connections"},{"location":"wxd-connections/#database-connections","text":"There are four database systems that can be accessed inside and outside the virtual machine environment: watsonx.data Presto, Db2 LUW, MySQL and PostgreSQL. In order to access these services from outside the virtual machine image, you need the server name and port for the service. You will also need to download the presto-key.jks file for connecting to Presto. Connection Certificate Accessing watsonx.data (Presto) Accessing Db2 Accessing PostgreSQL Accessing MySQL Adding a database to watsonx.data Accessing watsonx.data via Python Accessing watsonx.data via Pandas Dataframes Generating a Certificate Adding a Service","title":"Database Connections"},{"location":"wxd-connections/#watsonxdata-connection-certificate","text":"When connecting to the watsonx.data Presto database, you will need to have the connection certificate available to the client that you are using. Usually this location is your workstation, but it could be another service like CP4D. To extract the certificate to your local file system, use the following command in a terminal window.
Replace the port and region.techzone-server.com with the SSH values found in the TechZone reservation. scp -P port watsonx@region.techzone-server.com:/certs/presto-key.jks /Users/myname/Downloads Change the target directory to a location that you can remember! You can also download the certificate by using the Jupyter Notebook link and opening the Credentials notebook. There you will find links to the certificates.","title":"Watsonx.data Connection Certificate"},{"location":"wxd-connections/#watsonxdata-presto-access","text":"When connecting to the Presto engine, choose the PrestoDB driver.","title":"Watsonx.data Presto Access"},{"location":"wxd-connections/#presto-internal-access","text":"For local access, the following credentials are used: Hostname: localhost Port: 8443 Username: ibmlhadmin Password: password Database: tpch In addition, you need to set the following driver properties: SSL True SSLTrustStorePath /certs/presto-key.jks SSLTrustStorePassword watsonx.data","title":"Presto Internal Access"},{"location":"wxd-connections/#presto-external-access","text":"The watsonx.data Presto database requires that the certificate be extracted from the image. See the section above on Connection Certificate for more details. In the following settings, remember to update the Hostname and Port to the values provided in your TechZone reservation. The database connection settings are: Hostname: region.techzone-server.com Port: port Username: ibmlhadmin Password: password Database: tpch In addition, you need to set the following driver properties: SSL True SSLTrustStorePath /mydownload/presto-key.jks SSLTrustStorePassword watsonx.data Note : The /mydownload/presto-key.jks value needs to be replaced with the location to which you copied the key in the earlier step.","title":"Presto External Access"},{"location":"wxd-connections/#db2-access","text":"When connecting to the Db2 engine, select the Db2 LUW driver.","title":"Db2 Access"},{"location":"wxd-connections/#db2-internal-access","text":"The Db2 server can be accessed on port 50000 inside the virtual machine using the following credentials: Hostname - watsonxdata Port - 50000 Username - db2inst1 Password - db2inst1 Database - gosales SSL - off","title":"Db2 Internal Access"},{"location":"wxd-connections/#db2-external-access","text":"When accessing the database outside the virtual machine, you must change the host to region.techzone-server.com and the port number based on your TechZone reservation. All the other settings remain the same. Hostname - region.techzone-server.com Port - port Username - db2inst1 Password - db2inst1 Database - gosales SSL - off","title":"Db2 External Access"},{"location":"wxd-connections/#postgresql-access","text":"When connecting to the PostgreSQL engine, select the PostgreSQL driver. In order to connect to the PostgreSQL system, you will need to extract the admin password using the following command when connected to the watsonx.data system. cat /certs/passwords Alternatively, the following commands extract the password and save a copy of it in /tmp/postgres.pw. export POSTGRES_PASSWORD=$(docker exec ibm-lh-postgres printenv | grep POSTGRES_PASSWORD | sed 's/.*=//') echo \"Postgres Userid : admin\" echo \"Postgres Password : \" $POSTGRES_PASSWORD echo $POSTGRES_PASSWORD > /tmp/postgres.pw You can also retrieve the credentials by opening up the Credentials notebook in the Jupyter notebook service.","title":"PostgreSQL Access"},{"location":"wxd-connections/#postgresql-internal-access","text":"When accessing the PostgreSQL database in the system, use the following settings.
Hostname \u2013 ibm-lh-postgres Port \u2013 5432 Username \u2013 admin Password \u2013 The value that was extracted in the earlier step Database \u2013 gosales","title":"PostgreSQL Internal Access"},{"location":"wxd-connections/#postgresql-external-access","text":"The following credentials are used for remote access. Hostname: region.techzone-server.com Port: port Username: admin Password: The value that was extracted in the earlier step Database name: gosales","title":"PostgreSQL External Access"},{"location":"wxd-connections/#mysql-access","text":"When connecting to the MySQL engine, select the MySQL driver.","title":"MySQL Access"},{"location":"wxd-connections/#mysql-internal-access","text":"When accessing the MySQL database in the system, use the following settings. Hostname \u2013 watsonxdata Port \u2013 3306 Username \u2013 root Password \u2013 password Database \u2013 gosalesdw You must set the allowPublicKeyRetrieval to True for the connection to work with dBeaver.","title":"MySQL Internal Access"},{"location":"wxd-connections/#mysql-external-access","text":"The following credentials are used for remote access. Hostname: region.techzone-server.com Port: port Username: root Password: password Database name: gosalesdw You must set the allowPublicKeyRetrieval to True for the connection to work with dBeaver (see above).","title":"MySQL External Access"},{"location":"wxd-connections/#adding-a-database-to-watsonxdata","text":"When adding a database engine to the watsonx.data system, make sure to change the database display name since that needs to be unique. For instance, when you add the gosales database from Db2 to the system, the display name could be gosales as well. However, if you now add the PostgreSQL database to the system, the display name cannot be the same. You may want to differentiate databases with the same name by prefixing them with the database type. For instance, the gosales database could be shown as db2_gosales or pg_gosales so that you keep the names distinct. Once a database has been added, make sure to wait for a few moments before attempting to access the database. The Presto server takes a few moments to start up. To make sure that it is running, run the check_presto command in a terminal window and wait until it says the service is ready. When attempting to view the contents of a new database, the process may take a few minutes to complete. Refresh the browser window if you haven't seen any changes to the display.","title":"Adding a Database to watsonx.data"},{"location":"wxd-connections/#accessing-watsonxdata-via-python","text":"In order to access the watsonx.data database (Presto), you will need to install the Presto client using the following command on your local machine. pip3 install presto-python-client Once the installation is complete, extract the certificate from the watsonx.data server that we will use in the connection. scp -P port watsonx@region.techzone-server.com:/certs/presto-key.jks /Users/myname/Downloads Change the target directory to a location that you can remember!
You can also download the certificate by using the Jupyter Notebook link and opening the Certificate notebook.","title":"Accessing watsonx.data via Python"},{"location":"wxd-connections/#python-and-jupyter-connection-code","text":"Your Python or Jupyter notebook code will need to import the prestodb library and then connect to watsonx.data using the connect call. import prestodb conn = prestodb.dbapi.connect( host='watsonxdata', port=8443, user='ibmlhadmin', catalog='tpch', schema='tiny', http_scheme='https', auth=prestodb.auth.BasicAuthentication(\"ibmlhadmin\", \"password\") ) conn._http_session.verify = '/certs/lh-ssl-ts.crt' cur = conn.cursor() In the above connection string, you will need to replace the following values: host - watsonxdata when connecting from the virtual machine, region.techzone-server.com (with the TechZone port) when connecting from outside the virtual machine, and ibm-lh-presto-svc when connecting from inside the container network catalog - The name of the catalog that is being accessed schema - The schema inside the catalog that will be used You also need to update the conn._http_session.verify value with the location where you downloaded the lh-ssl-ts.crt file. For internal connections, this value will be /certs/lh-ssl-ts.crt . Once connected, you can run an SQL statement and return the results. cur.execute(\"SELECT * FROM tpch.tiny.customer\") rows = cur.fetchall() The rows variable contains the answer set from the select statement. You can manipulate the rows variable to view the results. rows[0] [1, 'Customer#000000001', 'IVhzIApeRb ot,c,E', 15, '25-989-741-2988', 711.56, 'BUILDING', 'to the even, regular platelets. regular, ironic epitaphs nag e'] The PrestoDB driver supports the DBAPI spec. For more details on the use of the DBAPI interface, please refer to https://peps.python.org/pep-0249/ . For instance, if you want to find the description of the columns returned, you would use the description attribute. cur.description [('custkey', 'bigint', None, None, None, None, None), ('name', 'varchar(25)', None, None, None, None, None), ('address', 'varchar(40)', None, None, None, None, None), ('nationkey', 'bigint', None, None, None, None, None), ('phone', 'varchar(15)', None, None, None, None, None), ('acctbal', 'double', None, None, None, None, None), ('mktsegment', 'varchar(10)', None, None, None, None, None), ('comment', 'varchar(117)', None, None, None, None, None)]","title":"Python and Jupyter Connection Code"},{"location":"wxd-connections/#accessing-watsonxdata-via-pandas-dataframes","text":"The following code is required for accessing watsonx.data in Jupyter notebooks. Run the following code inside a notebook code cell. %pip install ipython-sql==0.4.1 %pip install sqlalchemy==1.4.46 %pip install sqlalchemy==1.4.46 \"pyhive[presto]\" The notebook may need a restart of the kernel to pick up the changes to the driver. If you are running in a Jupyter Lab environment, you can use the most current versions of the drivers. %pip install ipython-sql %pip install sqlalchemy %pip install sqlalchemy \"pyhive[presto]\" Once the drivers have been loaded, you will need to extract the certificate from the watsonx.data server that we will use in the connection. scp -P port watsonx@region.techzone-server.com:/certs/presto-key.jks /Users/myname/Downloads Change the target directory to a location that you can remember! You can also download the certificate by using the Jupyter Notebook link and opening the Certificate notebook. In your Jupyter notebook, you will need to import a number of libraries.
import pandas as pd import sqlalchemy from sqlalchemy import create_engine Create a notebook cell which will contain all the credentials that are required to connect. Change the catalog , schema and certfile to your values. userid = \"ibmlhadmin\" password = \"password\" hostname = \"watsonxdata\" port = \"8443\" catalog = \"tpch\" schema = \"tiny\" certfile = \"/certs/lh-ssl-ts.crt\" connect_args={ 'protocol': 'https', 'requests_kwargs': {'verify': f'{certfile}'} } In the above settings, you will need to replace the following values: hostname - watsonxdata when connecting from the virtual machine, region.techzone-server.com (with the TechZone port) when connecting from outside the virtual machine, and ibm-lh-presto-svc when connecting from inside the container network catalog - The name of the catalog that is being accessed schema - The schema inside the catalog that will be used You also need to update the certfile value with the location where you downloaded the lh-ssl-ts.crt file. For internal connections, this value will be /certs/lh-ssl-ts.crt . To create a connection to the database, use the following syntax. engine = create_engine( f\"presto://{userid}:{password}@{hostname}:{port}/{catalog}/{schema}\", connect_args=connect_args ) Now that you have established a connection, you can use the Pandas read_sql_query function to execute a SELECT statement against the database. mypresto = pd.read_sql_query('SELECT * from tpch.tiny.customer',engine) The variable mypresto contains the dataframe generated from the SELECT statement. mypresto You can use the features of Pandas to generate plots of the data in your notebook. First make sure you have matplotlib installed. %pip install matplotlib The following query will compute the total account balance across all nation key values. Note that the aggregate column is given an alias ( totalbalance ) so that it can be referenced when plotting. sumbynation = pd.read_sql_query('SELECT \"nationkey\", sum(\"acctbal\") AS \"totalbalance\" from tpch.tiny.customer group by \"nationkey\" order by 2',engine) Finally, we plot the results. import matplotlib.pyplot as plt sumbynation.plot(kind=\"bar\", x=\"nationkey\", y=\"totalbalance\") plt.show()","title":"Accessing watsonx.data via Pandas Dataframes"},{"location":"wxd-connections/#adding-a-service","text":"The watsonx.data developer edition includes two open ports which can be used to externalize a service that you create in the image. For instance, you may choose to create a MongoDB or MSSQL container using Docker and want to access this service from your own dBeaver or Mongo tooling. Since port numbers vary between different databases, the watsonx.data system provides two port numbers that can be used by your service. Open Port 1 - Server: region.techzone-services.com Port: 12345 Open Port 2 - Server: region.techzone-services.com Port: 23456 The internal port numbers are 10000 (Port 1) and 10001 (Port 2). The following steps are required to use these ports with your service.","title":"Adding a Service"},{"location":"wxd-connections/#open-the-local-firewall-version-110-only","text":"Ports 10000/1 are not open by default in the 1.1.0 image. For release 1.1.1, you can skip this step. You must explicitly open ports 10000/1 with the firewall-cmd command. In a command line shell, as the root user, enter the following commands: sudo su - firewall-cmd --add-port={10000/tcp,10001/tcp} --zone=public --permanent firewall-cmd --reload You can use the following command to check that the ports are now open. firewall-cmd --list-ports","title":"Open the local Firewall (Version 1.1.0 Only)"},{"location":"wxd-connections/#create-your-service","text":"When creating your service, make sure to map the internal Docker port to either port 10000 or 10001.
If you cannot remap the port, see the section on port redirection. For instance, the following command will start Microsoft SQLServer in Docker by mapping the host port 10000 to the SQLServer port 1433 ( -p 10000:1433 ). docker run -e \"ACCEPT_EULA=Y\" -e \"MSSQL_SA_PASSWORD=Passw0rd12345678!\" \\ -p 10000:1433 --name mssql-server --hostname mssql-server \\ -d mcr.microsoft.com/mssql/server:2019-latest You can check the port mapping with the following command. docker container ls --format \"table {{.ID}}\\t{{.Names}}\\t{{.Ports}}\" -a | grep mssql-server When creating a connection to this database using an external tool, make sure to use the port number supplied in the reservation details (Open Port 1 is for port 10000 and Open Port 2 is for port 10001).","title":"Create your Service"},{"location":"wxd-connections/#port-redirection","text":"If you already have an existing service mapped to a different port, you can use port redirection to use either port 10000 or 10001. For instance, assume that the previous creation of the SQLServer database used port 1433. docker run -e \"ACCEPT_EULA=Y\" -e \"MSSQL_SA_PASSWORD=Passw0rd12345678!\" \\ -p 1433:1433 --name mssql-server --hostname mssql-server \\ -d mcr.microsoft.com/mssql/server:2019-latest Once the service is up and running, you can redirect the traffic on port 10000/1 to the port of your docker service. firewall-cmd --add-forward-port=port=10000:proto=tcp:toport=1433 --permanent --zone=public firewall-cmd --reload If you need to remove the redirection, use the following command. firewall-cmd --remove-forward-port=port=10000:proto=tcp:toport=1433:toaddr= --permanent --zone=public firewall-cmd --reload","title":"Port Redirection"},{"location":"wxd-connections/#accessing-your-service","text":"When referring to your service from an external location, always use the port numbers that are provided for Open Port 1 or 2. Open Port 1 - Server: region.techzone-services.com Port: 12345 Open Port 2 - Server: region.techzone-services.com Port: 23456 Your server will be region.techzone-services.com and the port number will be either of the two port numbers provided. Remember that this port number will need to be opened in the server and a Docker mapping to the open port or a firewall port redirection will be required.","title":"Accessing your Service"},{"location":"wxd-datasets-gosales/","text":"Great Outdoors Company The Sample Outdoors Company, or GO Sales, or any variation of the Sample Outdoors name, is the name of a fictitious business operation whose sample data is used to develop sample applications for IBM\u00ae and IBM customers. Its fictitious records include sample data for sales transactions, product distribution, finance, and human resources. Any resemblance to actual names, addresses, contact numbers, or transaction values, is coincidental. The following two links provide more details on the database. Great Outdoors Company Great Outdoors Database Reference The second link will say that there is no content available, but if you click on the down arrow you will see the table names. Disclaimer The Sample Outdoors Company, or GO Sales, or any variation of the Sample Outdoors name, is the name of a fictitious business operation whose sample data is used to develop sample applications for IBM\u00ae and IBM customers. Its fictitious records include sample data for sales transactions, product distribution, finance, and human resources. Any resemblance to actual names, addresses, contact numbers, or transaction values, is coincidental.
Unauthorized duplication is prohibited. Table Definitions These tables are created under the GOSALESDW schema. Thanks to Michael Schapira for generating the following ER diagram. You may need to download the image to zoom in on the relationships. DIST_INVENTORY_FACT Column Type MONTH_KEY INTEGER ORGANIZATION_KEY INTEGER BRANCH_KEY INTEGER PRODUCT_KEY INTEGER OPENING_INVENTORY INTEGER QUANTITY_SHIPPED INTEGER ADDITIONS INTEGER UNIT_COST DECIMAL CLOSING_INVENTORY INTEGER AVERAGE_UNIT_COST DECIMAL DIST_PRODUCT_FORECASE_FACT Column Type MONTH_KEY INTEGER ORGANIZATION_KEY INTEGER BASE_PRODUCT_KEY INTEGER BRANCH_KEY INTEGER UNIT_COST DECIMAL UNIT_PRICE DECIMAL EXPECTED_VOLUME INTEGER DIST_RETURNED_ITEMS_FACT Column Type DAY_KEY INTEGER ORGANIZATION_KEY INTEGER BRANCH_KEY INTEGER EMPLOYEE_KEY INTEGER RETAILER_SITE_KEY INTEGER PRODUCT_KEY INTEGER ORDER_METHOD_KEY INTEGER SALES_ORDER_KEY INTEGER RETURN_REASON_KEY INTEGER RETURN_QUANTITY INTEGER DIST_RETURN_REASON_DIM Column Type RETURN_REASON_KEY INTEGER RETURN_REASON_CODE INTEGER REASON_DESCRIPTION_EN VARCHAR EMP_EMPLOYEE_DIM Column Type EMPLOYEE_KEY INTEGER MANAGER_CODE1 INTEGER MANAGER1 VARCHAR MANAGER_MB1 VARCHAR MANAGER_CODE2 INTEGER MANAGER2 VARCHAR MANAGER_MB2 VARCHAR MANAGER_CODE3 INTEGER MANAGER3 VARCHAR MANAGER_MB3 VARCHAR MANAGER_CODE4 INTEGER MANAGER4 VARCHAR MANAGER_MB4 VARCHAR MANAGER_CODE5 INTEGER MANAGER5 VARCHAR MANAGER_MB5 VARCHAR MANAGER_CODE6 INTEGER MANAGER6 VARCHAR MANAGER_MB6 VARCHAR EMPLOYEE_CODE INTEGER EMPLOYEE_NAME VARCHAR FIRST_NAME VARCHAR LAST_NAME VARCHAR EMPLOYEE_NAME_MB VARCHAR FIRST_NAME_MB VARCHAR LAST_NAME_MB VARCHAR MANAGER_CODE INTEGER ORGANIZATION_CODE VARCHAR ADDRESS1 VARCHAR ADDRESS2 VARCHAR ADDRESS1_MB VARCHAR ADDRESS2_MB VARCHAR CITY VARCHAR CITY_MB VARCHAR PROV_STATE VARCHAR PROV_STATE_MB VARCHAR POSTAL_ZONE VARCHAR BRANCH_CODE INTEGER BIRTH_DATE DATE GENDER_CODE SMALLINT WORK_PHONE VARCHAR EXTENSION VARCHAR FAX VARCHAR EMAIL VARCHAR DATE_HIRED DATE TERMINATION_CODE INTEGER TERMINATION_DATE DATE POSITION_START_DATE DATE POSITION_CODE INTEGER EMPLOYEE_LEVEL SMALLINT ACTIVE_INDICATOR SMALLINT RECORD_START_DATE DATE RECORD_END_DATE DATE MANAGER_KEY INTEGER EMP_EXPENSE_FACT Column Type DAY_KEY INTEGER ORGANIZATION_KEY INTEGER POSITION_KEY INTEGER EMPLOYEE_KEY INTEGER EXPENSE_TYPE_KEY INTEGER ACCOUNT_KEY INTEGER EXPENSE_UNIT_QUANTITY FLOAT EXPENSE_TOTAL DECIMAL EMP_EXPENSE_PLAN_FACT Column Type MONTH_KEY INTEGER ORGANIZATION_KEY INTEGER EXPENSE_TYPE_KEY INTEGER ACCOUNT_KEY VARCHAR EXPENSE_PLAN_TOTAL DECIMAL EMP_EXPENSE_TYPE_DIM Column Type EXPENSE_TYPE_KEY INTEGER EXPENSE_GROUP_CODE INTEGER EXPENSE_TYPE_CODE INTEGER EXPENSE_UNIT_CODE INTEGER EXPENSE_GROUP_EN VARCHAR EXPENSE_TYPE_EN VARCHAR EMP_EXPENSE_UNIT_LOOKUP Column Type EXPENSE_UNIT_CODE INTEGER EXPENSE_UNIT_EN VARCHAR EMP_POSITION_DIM Column Type POSITION_KEY INTEGER POSITION_CODE1 INTEGER POSITION_CODE2 INTEGER POSITION_CODE3 INTEGER POSITION_CODE INTEGER POSITION_PARENT INTEGER MIN_SALARY DECIMAL MAX_SALARY DECIMAL PAID_HOURLY INTEGER POSITION_LEVEL SMALLINT EMP_POSITION_LOOKUP Column Type POSITION_CODE INTEGER POSITION_EN VARCHAR EMP_POSITION_SUMMARY_FACT Column Type DAY_KEY INTEGER ORGANIZATION_KEY INTEGER POSITION_KEY INTEGER POSITION_COUNT INTEGER PLANNED_POSITION_COUNT INTEGER INTERNAL_HIRES INTEGER EXTERNAL_HIRES INTEGER TERMINATIONS INTEGER EMP_RANKING_DIM Column Type EMPLOYEE_RANKING_KEY INTEGER RANKING_CODE INTEGER RANKING_DESCRIPTION_EN VARCHAR EMP_RANKING_FACT Column Type ORGANIZATION_KEY INTEGER POSITION_KEY INTEGER EMPLOYEE_KEY INTEGER 
DAY_KEY INTEGER EMPLOYEE_RANKING_KEY INTEGER RANKING_CODE INTEGER EMP_RECRUITMENT_DIM Column Type RECRUITMENT_MEDIUM_KEY INTEGER RECRUITMENT_MEDIUM_CODE INTEGER RECRUITMENT_TYPE_CODE INTEGER RECRUITMENT_MEDIUM_NAME_EN VARCHAR RECRUITMENT_TYPE_EN VARCHAR EMP_RECRUITMENT_FACT Column Type POST_DAY_KEY INTEGER RECRUITMENT_MEDIUM_KEY INTEGER ORGANIZATION_KEY INTEGER BRANCH_KEY INTEGER POSITION_KEY INTEGER POSITION_POSTING_DATE DATE POSITION_FILLED_DATE DATE POSITION_START_DATE DATE DAYS_TO_FILL INTEGER EMP_SUCCESSION_FACT Column Type DAY_KEY INTEGER ORGANIZATION_KEY INTEGER POSITION_KEY INTEGER EMPLOYEE_KEY INTEGER SUCCESSOR_EMPLOYEE_KEY INTEGER SUCCESSOR_POSITION_KEY INTEGER SUCCESSOR_STATUS_KEY INTEGER PERCENT_READY FLOAT TARGET_PERCENT_READY FLOAT EMP_SUCCESSION_STATUS_DIM Column Type SUCCESSOR_STATUS_KEY INTEGER SUCCESSOR_STATUS_CODE INTEGER SUCCESSOR_STATUS_EN VARCHAR EMP_SUMMARY_FACT Column Type DAY_KEY INTEGER ORGANIZATION_KEY INTEGER POSITION_KEY INTEGER EMPLOYEE_KEY INTEGER SALARY DECIMAL PAY_INCREASE FLOAT BONUS FLOAT VACATION_DAYS_TAKEN FLOAT SICK_DAYS_TAKEN FLOAT EMP_SURVEY_FACT Column Type DAY_KEY INTEGER ORGANIZATION_KEY INTEGER POSITION_KEY INTEGER EMPLOYEE_TOPIC_KEY INTEGER EMPLOYEE_TOPIC_SCORE FLOAT SATISFACTION_KEY INTEGER EMP_SURVEY_TARG_FACT Column Type MONTH_KEY INTEGER EMPLOYEE_TOPIC_KEY INTEGER EMPLOYEE_TOPIC_TARGET FLOAT EMPLOYEE_SURVEY_BENCHMARK FLOAT EMP_SURVEY_TOPIC_DIM Column Type EMPLOYEE_TOPIC_KEY INTEGER EMPLOYEE_TOPIC_CODE INTEGER EMPLOYEE_TOPIC_EN VARCHAR EMP_TERMINATION_LOOKUP Column Type TERMINATION_CODE INTEGER TERMINATION_REASON_EN VARCHAR EMP_TRAINING_DIM Column Type TRAINING_KEY INTEGER COURSE_CODE INTEGER COURSE_COST DECIMAL COURSE_DAYS FLOAT COURSE_NAME_EN VARCHAR EMP_TRAINING_FACT Column Type ORGANIZATION_KEY INTEGER POSITION_KEY INTEGER EMPLOYEE_KEY INTEGER DAY_KEY INTEGER EXPENSE_TYPE_KEY INTEGER TRAINING_KEY INTEGER COURSE_COST DECIMAL COURSE_DAYS FLOAT FIN_ACCOUNT_CLASS_LOOKUP Column Type ACCOUNT_CLASS_CODE INTEGER ACCOUNT_CLASS_EN VARCHAR FIN_ACCOUNT_DIM Column Type ACCOUNT_KEY INTEGER ACCOUNT_CODE1 VARCHAR ACCOUNT_CODE2 VARCHAR ACCOUNT_CODE3 VARCHAR ACCOUNT_CODE4 VARCHAR ACCOUNT_CODE5 VARCHAR ACCOUNT_CODE6 VARCHAR ACCOUNT_CODE7 VARCHAR ACCOUNT_CODE8 VARCHAR ACCOUNT_CODE9 VARCHAR ACCOUNT_CODE10 VARCHAR ACCOUNT_CODE11 VARCHAR ACCOUNT_CODE12 VARCHAR ACCOUNT_CODE13 VARCHAR ACCOUNT_CODE14 VARCHAR ACCOUNT_CODE15 VARCHAR ACCOUNT_CODE16 VARCHAR ACCOUNT_CODE VARCHAR ACCOUNT_PARENT VARCHAR DEBIT_OR_CREDIT CHAR(3) ACCOUNT_TYPE_CODE CHAR(3) ACCOUNT_CLASS_CODE INTEGER ACCOUNT_LEVEL INTEGER AGGREGATION_SIGN CHAR(3) FIN_ACCOUNT_NAME_LOOKUP Column Type ACCOUNT_CODE VARCHAR ACCOUNT_NAME_EN VARCHAR FIN_ACCOUNT_TYPE_LOOKUP Column Type ACCOUNT_TYPE_CODE CHAR(3) ACCOUNT_TYPE_EN VARCHAR FIN_FINANCE_FACT Column Type SUBMISSION_KEY INTEGER ORGANIZATION_KEY INTEGER ACCOUNT_KEY INTEGER MONTH_KEY INTEGER AMOUNT_YEAR_TO_DATE DECIMAL AMOUNT_MONTH DECIMAL FIN_SUBM_CURRENCY_LOOKUP Column Type SUBMISSION_CURRENCY_CODE VARCHAR SUBMISSION_CURRENCY_EN VARCHAR FIN_SUBM_DIM Column Type SUBMISSION_KEY INTEGER SUBMISSION_CODE VARCHAR SUBMISSION_NAME_EN VARCHAR SUBMISSION_YEAR INTEGER SUBMISSION_TYPE_CODE CHAR(3) SUBMISSION_CURRENCY_CODE VARCHAR FIN_SUBM_TYPE_LOOKUP Column Type SUBMISSION_TYPE_CODE CHAR(3) SUBMISSION_TYPE_EN VARCHAR GO_BRANCH_DIM Column Type BRANCH_KEY INTEGER BRANCH_CODE INTEGER ADDRESS1 VARCHAR ADDRESS2 VARCHAR CITY VARCHAR PROV_STATE VARCHAR ADDRESS1_MB VARCHAR ADDRESS2_MB VARCHAR CITY_MB VARCHAR PROV_STATE_MB VARCHAR POSTAL_ZONE VARCHAR COUNTRY_CODE INTEGER
WAREHOUSE_BRANCH_CODE INTEGER GO_GENDER_LOOKUP Column Type GENDER_CODE SMALLINT GENDER_EN VARCHAR GO_ORG_DIM Column Type ORGANIZATION_KEY INTEGER ORGANIZATION_CODE1 VARCHAR ORGANIZATION_CODE2 VARCHAR ORGANIZATION_CODE3 VARCHAR ORGANIZATION_CODE4 VARCHAR ORGANIZATION_CODE VARCHAR ORGANIZATION_PARENT VARCHAR ORGANIZATION_LEVEL SMALLINT GO_ORG_NAME_LOOKUP Column Type ORGANIZATION_CODE VARCHAR ORGANIZATION_NAME_EN VARCHAR GO_REGION_DIM Column Type COUNTRY_KEY INTEGER COUNTRY_CODE INTEGER FLAG_IMAGE VARCHAR ISO_THREE_LETTER_CODE VARCHAR ISO_TWO_LETTER_CODE VARCHAR ISO_THREE_DIGIT_CODE VARCHAR REGION_KEY INTEGER REGION_CODE INTEGER REGION_EN VARCHAR COUNTRY_EN VARCHAR GO_SATISFACTION_DIM Column Type SATISFACTION_KEY INTEGER SATISFACTION_CODE INTEGER SATISFACTION_LOWER_LIMIT FLOAT SATISFACTION_UPPER_LIMIT FLOAT SATISFACTION_DESCRIPTION_EN VARCHAR GO_TIME_DIM Column Type DAY_KEY INTEGER DAY_DATE DATE MONTH_KEY INTEGER CURRENT_MONTH SMALLINT MONTH_NUMBER INTEGER QUARTER_KEY INTEGER CURRENT_QUARTER SMALLINT CURRENT_YEAR SMALLINT DAY_OF_WEEK SMALLINT DAY_OF_MONTH SMALLINT DAYS_IN_MONTH SMALLINT DAY_OF_YEAR SMALLINT WEEK_OF_MONTH SMALLINT WEEK_OF_QUARTER SMALLINT WEEK_OF_YEAR SMALLINT MONTH_EN VARCHAR WEEKDAY_EN VARCHAR GO_TIME_QUARTER_LOOKUP Column Type QUARTER_KEY INTEGER QUARTER_EN VARCHAR MRK_ACTIVITY_STATUS_DIM Column Type ACTIVITY_STATUS_KEY INTEGER ACTIVITY_STATUS_CODE SMALLINT ACTIVITY_STATUS_EN VARCHAR MRK_BUNDLE_GROUP_LOOKUP Column Type BUNDLE_GROUP_CODE INTEGER BUNDLE_GROUP_EN VARCHAR MRK_CAMPAIGN_LOOKUP Column Type CAMPAIGN_CODE INTEGER CAMPAIGN_NAME_EN VARCHAR MRK_PRODUCT_SURVEY_DIM Column Type PRODUCT_SURVEY_KEY INTEGER PRODUCT_TOPIC_CODE INTEGER PRODUCT_TOPIC_EN VARCHAR MRK_PRODUCT_SURVEY_FACT Column Type MONTH_KEY INTEGER ORGANIZATION_KEY INTEGER RTL_COUNTRY_KEY INTEGER BRANCH_KEY INTEGER PRODUCT_KEY INTEGER PRODUCT_SURVEY_KEY INTEGER PRODUCT_TOPIC_SCORE FLOAT MRK_PROD_SURVEY_TARG_FACT Column Type MONTH_KEY INTEGER PRODUCT_KEY INTEGER PRODUCT_SURVEY_KEY INTEGER PRODUCT_TOPIC_TARGET FLOAT MRK_PROMOTION_DIM Column Type PROMOTION_KEY INTEGER PROMOTION_CODE INTEGER CAMPAIGN_CODE INTEGER BUNDLE_GROUP_CODE INTEGER PROMOTION_NAME_EN VARCHAR MRK_PROMOTION_FACT Column Type ORGANIZATION_KEY INTEGER ORDER_DAY_KEY INTEGER RTL_COUNTRY_KEY INTEGER EMPLOYEE_KEY INTEGER RETAILER_KEY INTEGER PRODUCT_KEY INTEGER PROMOTION_KEY INTEGER SALES_ORDER_KEY INTEGER QUANTITY SMALLINT UNIT_COST DECIMAL UNIT_PRICE DECIMAL UNIT_SALE_PRICE DECIMAL GROSS_MARGIN FLOAT SALE_TOTAL DECIMAL GROSS_PROFIT DECIMAL MRK_PROMOTION_PLAN_FACT Column Type MONTH_KEY INTEGER ORGANIZATION_KEY INTEGER BRANCH_KEY INTEGER RTL_COUNTRY_KEY INTEGER PRODUCT_KEY INTEGER PROMOTION_KEY INTEGER UNIT_COST DECIMAL UNIT_PRICE DECIMAL UNIT_SALE_PRICE DECIMAL PROMOTION_PLAN_QUANTITY INTEGER PROMOTION_PLAN_REVENUE DECIMAL MRK_RTL_SURVEY_DIM Column Type RETAILER_SURVEY_KEY INTEGER RETAILER_TOPIC_CODE INTEGER RETAILER_TOPIC_EN VARCHAR MRK_RTL_SURVEY_FACT Column Type MONTH_KEY INTEGER ORGANIZATION_KEY INTEGER BRANCH_KEY INTEGER RTL_COUNTRY_KEY INTEGER RETAILER_KEY INTEGER RETAILER_SURVEY_KEY INTEGER RETAILER_TOPIC_SCORE FLOAT MRK_RTL_SURVEY_TARG_FACT Column Type RETAILER_SURVEY_KEY INTEGER MONTH_KEY INTEGER RETAILER_TOPIC_WEIGHT INTEGER RETAILER_TOPIC_TARGET FLOAT SLS_ORDER_METHOD_DIM Column Type ORDER_METHOD_KEY INTEGER ORDER_METHOD_CODE INTEGER ORDER_METHOD_EN VARCHAR SLS_PRODUCT_BRAND_LOOKUP Column Type PRODUCT_BRAND_CODE INTEGER PRODUCT_BRAND_EN VARCHAR SLS_PRODUCT_COLOR_LOOKUP Column Type PRODUCT_COLOR_CODE INTEGER PRODUCT_COLOR_EN VARCHAR 
SLS_PRODUCT_DIM Column Type PRODUCT_KEY INTEGER PRODUCT_LINE_CODE INTEGER PRODUCT_TYPE_KEY INTEGER PRODUCT_TYPE_CODE INTEGER PRODUCT_NUMBER INTEGER BASE_PRODUCT_KEY INTEGER BASE_PRODUCT_NUMBER INTEGER PRODUCT_COLOR_CODE INTEGER PRODUCT_SIZE_CODE INTEGER PRODUCT_BRAND_KEY INTEGER PRODUCT_BRAND_CODE INTEGER PRODUCT_IMAGE VARCHAR INTRODUCTION_DATE DATE DISCONTINUED_DATE DATE SLS_PRODUCT_LINE_LOOKUP Column Type PRODUCT_LINE_CODE INTEGER PRODUCT_LINE_EN VARCHAR SLS_PRODUCT_LOOKUP Column Type PRODUCT_NUMBER INTEGER PRODUCT_LANGUAGE VARCHAR PRODUCT_NAME VARCHAR PRODUCT_DESCRIPTION VARCHAR SLS_PRODUCT_SIZE_LOOKUP Column Type PRODUCT_SIZE_CODE INTEGER PRODUCT_SIZE_EN VARCHAR SLS_PRODUCT_TYPE_LOOKUP Column Type PRODUCT_TYPE_CODE INTEGER PRODUCT_TYPE_EN VARCHAR SLS_RTL_DIM Column Type RETAILER_SITE_KEY INTEGER RETAILER_SITE_CODE INTEGER RETAILER_KEY INTEGER RETAILER_CODE INTEGER RETAILER_NAME VARCHAR RETAILER_NAME_MB VARCHAR RETAILER_CONTACT_CODE INTEGER CONTACT_FIRST_NAME VARCHAR CONTACT_LAST_NAME VARCHAR GENDER_CODE SMALLINT CONTACT_PHONE_NUMBER VARCHAR CONTACT_EXTENSION VARCHAR CONTACT_FAX VARCHAR CONTACT_EMAIL VARCHAR RTL_ADDRESS1 VARCHAR RTL_ADDRESS2 VARCHAR RTL_CITY VARCHAR RTL_PROV_STATE VARCHAR CONTACT_FIRST_NAME_MB VARCHAR CONTACT_LAST_NAME_MB VARCHAR RTL_ADDRESS1_MB VARCHAR RTL_ADDRESS2_MB VARCHAR RTL_CITY_MB VARCHAR RTL_PROV_STATE_MB VARCHAR RTL_POSTAL_ZONE VARCHAR RTL_COUNTRY_CODE INTEGER RETAILER_START_DATE DATE RETAILER_TYPE_CODE INTEGER RETAILER_TYPE_EN VARCHAR JOB_POSITION_EN VARCHAR SLS_SALES_FACT Column Type ORDER_DAY_KEY INTEGER ORGANIZATION_KEY INTEGER EMPLOYEE_KEY INTEGER RETAILER_KEY INTEGER RETAILER_SITE_KEY INTEGER PRODUCT_KEY INTEGER PROMOTION_KEY INTEGER ORDER_METHOD_KEY INTEGER SALES_ORDER_KEY INTEGER SHIP_DAY_KEY INTEGER CLOSE_DAY_KEY INTEGER QUANTITY BIGINT UNIT_COST DECIMAL UNIT_PRICE DECIMAL UNIT_SALE_PRICE DECIMAL GROSS_MARGIN FLOAT SALE_TOTAL DECIMAL GROSS_PROFIT DECIMAL SLS_SALES_ORDER_DIM Column Type SALES_ORDER_KEY INTEGER ORDER_DETAIL_CODE INTEGER ORDER_NUMBER INTEGER WAREHOUSE_BRANCH_CODE INTEGER SLS_SALES_TARG_FACT Column Type MONTH_KEY INTEGER ORGANIZATION_KEY INTEGER RTL_COUNTRY_KEY INTEGER EMPLOYEE_KEY INTEGER RETAILER_KEY INTEGER PRODUCT_TYPE_KEY INTEGER PRODUCT_BRAND_KEY INTEGER SALES_TARGET DECIMAL","title":"Great Outdoors Company"},{"location":"wxd-datasets-gosales/#great-outdoors-company","text":"The Sample Outdoors Company, or GO Sales, or any variation of the Sample Outdoors name, is the name of a fictitious business operation whose sample data is used to develop sample applications for IBM\u00ae and IBM customers. Its fictitious records include sample data for sales transactions, product distribution, finance, and human resources. Any resemblance to actual names, addresses, contact numbers, or transaction values, is coincidental. Two links that provide more details on the database. Great Outdoors Company Great Outdoors Database Reference The second link will say that there is no content available, but if you click on the down arrow you will see the table names.","title":"Great Outdoors Company"},{"location":"wxd-datasets-gosales/#disclaimer","text":"The Sample Outdoors Company, or GO Sales, or any variation of the Sample Outdoors name, is the name of a fictitious business operation whose sample data is used to develop sample applications for IBM\u00ae and IBM customers. Its fictitious records include sample data for sales transactions, product distribution, finance, and human resources. 
Any resemblance to actual names, addresses, contact numbers, or transaction values, is coincidental. Unauthorized duplication is prohibited.","title":"Disclaimer"},{"location":"wxd-datasets-gosales/#table-definitions","text":"These tables are created under the GOSALESDW schema. Thanks to Michael Schapira for generating the following ER diagram. You may need to download the image to zoom in on the relationships.","title":"Table Definitions"},{"location":"wxd-datasets-gosales/#dist_inventory_fact","text":"Column Type MONTH_KEY INTEGER ORGANIZATION_KEY INTEGER BRANCH_KEY INTEGER PRODUCT_KEY INTEGER OPENING_INVENTORY INTEGER QUANTITY_SHIPPED INTEGER ADDITIONS INTEGER UNIT_COST DECIMAL CLOSING_INVENTORY INTEGER AVERAGE_UNIT_COST DECIMAL","title":"DIST_INVENTORY_FACT"},{"location":"wxd-datasets-gosales/#dist_product_forecase_fact","text":"Column Type MONTH_KEY INTEGER ORGANIZATION_KEY INTEGER BASE_PRODUCT_KEY INTEGER BRANCH_KEY INTEGER UNIT_COST DECIMAL UNIT_PRICE DECIMAL EXPECTED_VOLUME INTEGER","title":"DIST_PRODUCT_FORECASE_FACT"},{"location":"wxd-datasets-gosales/#dist_returned_items_fact","text":"Column Type DAY_KEY INTEGER ORGANIZATION_KEY INTEGER BRANCH_KEY INTEGER EMPLOYEE_KEY INTEGER RETAILER_SITE_KEY INTEGER PRODUCT_KEY INTEGER ORDER_METHOD_KEY INTEGER SALES_ORDER_KEY INTEGER RETURN_REASON_KEY INTEGER RETURN_QUANTITY INTEGER","title":"DIST_RETURNED_ITEMS_FACT"},{"location":"wxd-datasets-gosales/#dist_return_reason_dim","text":"Column Type RETURN_REASON_KEY INTEGER RETURN_REASON_CODE INTEGER REASON_DESCRIPTION_EN VARCHAR","title":"DIST_RETURN_REASON_DIM"},{"location":"wxd-datasets-gosales/#emp_employee_dim","text":"Column Type EMPLOYEE_KEY INTEGER MANAGER_CODE1 INTEGER MANAGER1 VARCHAR MANAGER_MB1 VARCHAR MANAGER_CODE2 INTEGER MANAGER2 VARCHAR MANAGER_MB2 VARCHAR MANAGER_CODE3 INTEGER MANAGER3 VARCHAR MANAGER_MB3 VARCHAR MANAGER_CODE4 INTEGER MANAGER4 VARCHAR MANAGER_MB4 VARCHAR MANAGER_CODE5 INTEGER MANAGER5 VARCHAR MANAGER_MB5 VARCHAR MANAGER_CODE6 INTEGER MANAGER6 VARCHAR MANAGER_MB6 VARCHAR EMPLOYEE_CODE INTEGER EMPLOYEE_NAME VARCHAR FIRST_NAME VARCHAR LAST_NAME VARCHAR EMPLOYEE_NAME_MB VARCHAR FIRST_NAME_MB VARCHAR LAST_NAME_MB VARCHAR MANAGER_CODE INTEGER ORGANIZATION_CODE VARCHAR ADDRESS1 VARCHAR ADDRESS2 VARCHAR ADDRESS1_MB VARCHAR ADDRESS2_MB VARCHAR CITY VARCHAR CITY_MB VARCHAR PROV_STATE VARCHAR PROV_STATE_MB VARCHAR POSTAL_ZONE VARCHAR BRANCH_CODE INTEGER BIRTH_DATE DATE GENDER_CODE SMALLINT WORK_PHONE VARCHAR EXTENSION VARCHAR FAX VARCHAR EMAIL VARCHAR DATE_HIRED DATE TERMINATION_CODE INTEGER TERMINATION_DATE DATE POSITION_START_DATE DATE POSITION_CODE INTEGER EMPLOYEE_LEVEL SMALLINT ACTIVE_INDICATOR SMALLINT RECORD_START_DATE DATE RECORD_END_DATE DATE MANAGER_KEY INTEGER","title":"EMP_EMPLOYEE_DIM"},{"location":"wxd-datasets-gosales/#emp_expense_fact","text":"Column Type DAY_KEY INTEGER ORGANIZATION_KEY INTEGER POSITION_KEY INTEGER EMPLOYEE_KEY INTEGER EXPENSE_TYPE_KEY INTEGER ACCOUNT_KEY INTEGER EXPENSE_UNIT_QUANTITY FLOAT EXPENSE_TOTAL DECIMAL","title":"EMP_EXPENSE_FACT"},{"location":"wxd-datasets-gosales/#emp_expense_plan_fact","text":"Column Type MONTH_KEY INTEGER ORGANIZATION_KEY INTEGER EXPENSE_TYPE_KEY INTEGER ACCOUNT_KEY VARCHAR EXPENSE_PLAN_TOTAL DECIMAL","title":"EMP_EXPENSE_PLAN_FACT"},{"location":"wxd-datasets-gosales/#emp_expense_type_dim","text":"Column Type EXPENSE_TYPE_KEY INTEGER EXPENSE_GROUP_CODE INTEGER EXPENSE_TYPE_CODE INTEGER EXPENSE_UNIT_CODE INTEGER EXPENSE_GROUP_EN VARCHAR EXPENSE_TYPE_EN 
VARCHAR","title":"EMP_EXPENSE_TYPE_DIM"},{"location":"wxd-datasets-gosales/#emp_expense_unit_lookup","text":"Column Type EXPENSE_UNIT_CODE INTEGER EXPENSE_UNIT_EN VARCHAR","title":"EMP_EXPENSE_UNIT_LOOKUP"},{"location":"wxd-datasets-gosales/#emp_position_dim","text":"Column Type POSITION_KEY INTEGER POSITION_CODE1 INTEGER POSITION_CODE2 INTEGER POSITION_CODE3 INTEGER POSITION_CODE INTEGER POSITION_PARENT INTEGER MIN_SALARY DECIMAL MAX_SALARY DECIMAL PAID_HOURLY INTEGER POSITION_LEVEL SMALLINT","title":"EMP_POSITION_DIM"},{"location":"wxd-datasets-gosales/#emp_position_lookup","text":"Column Type POSITION_CODE INTEGER POSITION_EN VARCHAR","title":"EMP_POSITION_LOOKUP"},{"location":"wxd-datasets-gosales/#emp_position_summary_fact","text":"Column Type DAY_KEY INTEGER ORGANIZATION_KEY INTEGER POSITION_KEY INTEGER POSITION_COUNT INTEGER PLANNED_POSITION_COUNT INTEGER INTERNAL_HIRES INTEGER EXTERNAL_HIRES INTEGER TERMINATIONS INTEGER","title":"EMP_POSITION_SUMMARY_FACT"},{"location":"wxd-datasets-gosales/#emp_ranking_dim","text":"Column Type EMPLOYEE_RANKING_KEY INTEGER RANKING_CODE INTEGER RANKING_DESCRIPTION_EN VARCHAR","title":"EMP_RANKING_DIM"},{"location":"wxd-datasets-gosales/#emp_ranking_fact","text":"Column Type ORGANIZATION_KEY INTEGER POSITION_KEY INTEGER EMPLOYEE_KEY INTEGER DAY_KEY INTEGER EMPLOYEE_RANKING_KEY INTEGER RANKING_CODE INTEGER","title":"EMP_RANKING_FACT"},{"location":"wxd-datasets-gosales/#emp_recruitment_dim","text":"Column Type RECRUITMENT_MEDIUM_KEY INTEGER RECRUITMENT_MEDIUM_CODE INTEGER RECRUITMENT_TYPE_CODE INTEGER RECRUITMENT_MEDIUM_NAME_EN VARCHAR RECRUITMENT_TYPE_EN VARCHAR","title":"EMP_RECRUITMENT_DIM"},{"location":"wxd-datasets-gosales/#emp_recruitment_fact","text":"Column Type POST_DAY_KEY INTEGER RECRUITMENT_MEDIUM_KEY INTEGER ORGANIZATION_KEY INTEGER BRANCH_KEY INTEGER POSITION_KEY INTEGER POSITION_POSTING_DATE DATE POSITION_FILLED_DATE DATE POSITION_START_DATE DATE DAYS_TO_FILL INTEGER","title":"EMP_RECRUITMENT_FACT"},{"location":"wxd-datasets-gosales/#emp_succession_fact","text":"Column Type DAY_KEY INTEGER ORGANIZATION_KEY INTEGER POSITION_KEY INTEGER EMPLOYEE_KEY INTEGER SUCCESSOR_EMPLOYEE_KEY INTEGER SUCCESSOR_POSITION_KEY INTEGER SUCCESSOR_STATUS_KEY INTEGER PERCENT_READY FLOAT TARGET_PERCENT_READY FLOAT","title":"EMP_SUCCESSION_FACT"},{"location":"wxd-datasets-gosales/#emp_succession_status_dim","text":"Column Type SUCCESSOR_STATUS_KEY INTEGER SUCCESSOR_STATUS_CODE INTEGER SUCCESSOR_STATUS_EN VARCHAR","title":"EMP_SUCCESSION_STATUS_DIM"},{"location":"wxd-datasets-gosales/#emp_summary_fact","text":"Column Type DAY_KEY INTEGER ORGANIZATION_KEY INTEGER POSITION_KEY INTEGER EMPLOYEE_KEY INTEGER SALARY DECIMAL PAY_INCREASE FLOAT BONUS FLOAT VACATION_DAYS_TAKEN FLOAT SICK_DAYS_TAKEN FLOAT","title":"EMP_SUMMARY_FACT"},{"location":"wxd-datasets-gosales/#emp_survey_fact","text":"Column Type DAY_KEY INTEGER ORGANIZATION_KEY INTEGER POSITION_KEY INTEGER EMPLOYEE_TOPIC_KEY INTEGER EMPLOYEE_TOPIC_SCORE FLOAT SATISFACTION_KEY INTEGER","title":"EMP_SURVEY_FACT"},{"location":"wxd-datasets-gosales/#emp_survey_targ_fact","text":"Column Type MONTH_KEY INTEGER EMPLOYEE_TOPIC_KEY INTEGER EMPLOYEE_TOPIC_TARGET FLOAT EMPLOYEE_SURVEY_BENCHMARK FLOAT","title":"EMP_SURVEY_TARG_FACT"},{"location":"wxd-datasets-gosales/#emp_survey_topic_dim","text":"Column Type EMPLOYEE_TOPIC_KEY INTEGER EMPLOYEE_TOPIC_CODE INTEGER EMPLOYEE_TOPIC_EN VARCHAR","title":"EMP_SURVEY_TOPIC_DIM"},{"location":"wxd-datasets-gosales/#emp_termination_lookup","text":"Column Type TERMINATION_CODE INTEGER 
TERMINATION_REASON_EN VARCHAR","title":"EMP_TERMINATION_LOOKUP"},{"location":"wxd-datasets-gosales/#emp_training_dim","text":"Column Type TRAINING_KEY INTEGER COURSE_CODE INTEGER COURSE_COST DECIMAL COURSE_DAYS FLOAT COURSE_NAME_EN VARCHAR","title":"EMP_TRAINING_DIM"},{"location":"wxd-datasets-gosales/#emp_training_fact","text":"Column Type ORGANIZATION_KEY INTEGER POSITION_KEY INTEGER EMPLOYEE_KEY INTEGER DAY_KEY INTEGER EXPENSE_TYPE_KEY INTEGER TRAINING_KEY INTEGER COURSE_COST DECIMAL COURSE_DAYS FLOAT","title":"EMP_TRAINING_FACT"},{"location":"wxd-datasets-gosales/#fin_account_class_lookup","text":"Column Type ACCOUNT_CLASS_CODE INTEGER ACCOUNT_CLASS_EN VARCHAR","title":"FIN_ACCOUNT_CLASS_LOOKUP"},{"location":"wxd-datasets-gosales/#fin_account_dim","text":"Column Type ACCOUNT_KEY INTEGER ACCOUNT_CODE1 VARCHAR ACCOUNT_CODE2 VARCHAR ACCOUNT_CODE3 VARCHAR ACCOUNT_CODE4 VARCHAR ACCOUNT_CODE5 VARCHAR ACCOUNT_CODE6 VARCHAR ACCOUNT_CODE7 VARCHAR ACCOUNT_CODE8 VARCHAR ACCOUNT_CODE9 VARCHAR ACCOUNT_CODE10 VARCHAR ACCOUNT_CODE11 VARCHAR ACCOUNT_CODE12 VARCHAR ACCOUNT_CODE13 VARCHAR ACCOUNT_CODE14 VARCHAR ACCOUNT_CODE15 VARCHAR ACCOUNT_CODE16 VARCHAR ACCOUNT_CODE VARCHAR ACCOUNT_PARENT VARCHAR DEBIT_OR_CREDIT CHAR(3) ACCOUNT_TYPE_CODE CHAR(3) ACCOUNT_CLASS_CODE INTEGER ACCOUNT_LEVEL INTEGER AGGREGATION_SIGN CHAR(3)","title":"FIN_ACCOUNT_DIM"},{"location":"wxd-datasets-gosales/#fin_account_name_lookup","text":"Column Type ACCOUNT_CODE VARCHAR ACCOUNT_NAME_EN VARCHAR","title":"FIN_ACCOUNT_NAME_LOOKUP"},{"location":"wxd-datasets-gosales/#fin_account_type_lookup","text":"Column Type ACCOUNT_TYPE_CODE CHAR(3) ACCOUNT_TYPE_EN VARCHAR","title":"FIN_ACCOUNT_TYPE_LOOKUP"},{"location":"wxd-datasets-gosales/#fin_finance_fact","text":"Column Type SUBMISSION_KEY INTEGER ORGANIZATION_KEY INTEGER ACCOUNT_KEY INTEGER MONTH_KEY INTEGER AMOUNT_YEAR_TO_DATE DECIMAL AMOUNT_MONTH DECIMAL","title":"FIN_FINANCE_FACT"},{"location":"wxd-datasets-gosales/#fin_subm_currency_lookup","text":"Column Type SUBMISSION_CURRENCY_CODE VARCHAR SUBMISSION_CURRENCY_EN VARCHAR","title":"FIN_SUBM_CURRENCY_LOOKUP"},{"location":"wxd-datasets-gosales/#fin_subm_dim","text":"Column Type SUBMISSION_KEY INTEGER SUBMISSION_CODE VARCHAR SUBMISSION_NAME_EN VARCHAR SUBMISSION_YEAR INTEGER SUBMISSION_TYPE_CODE CHAR(3) SUBMISSION_CURRENCY_CODE VARCHAR","title":"FIN_SUBM_DIM"},{"location":"wxd-datasets-gosales/#fin_subm_type_lookup","text":"Column Type SUBMISSION_TYPE_CODE CHAR(3) SUBMISSION_TYPE_EN VARCHAR","title":"FIN_SUBM_TYPE_LOOKUP"},{"location":"wxd-datasets-gosales/#go_branch_dim","text":"Column Type BRANCH_KEY INTEGER BRANCH_CODE INTEGER ADDRESS1 VARCHAR ADDRESS2 VARCHAR CITY VARCHAR PROV_STATE VARCHAR ADDRESS1_MB VARCHAR ADDRESS2_MB VARCHAR CITY_MB VARCHAR PROV_STATE_MB VARCHAR POSTAL_ZONE VARCHAR COUNTRY_CODE INTEGER WAREHOUSE_BRANCH_CODE INTEGER","title":"GO_BRANCH_DIM"},{"location":"wxd-datasets-gosales/#go_gender_lookup","text":"Column Type GENDER_CODE SMALLINT GENDER_EN VARCHAR","title":"GO_GENDER_LOOKUP"},{"location":"wxd-datasets-gosales/#go_org_dim","text":"Column Type ORGANIZATION_KEY INTEGER ORGANIZATION_CODE1 VARCHAR ORGANIZATION_CODE2 VARCHAR ORGANIZATION_CODE3 VARCHAR ORGANIZATION_CODE4 VARCHAR ORGANIZATION_CODE VARCHAR ORGANIZATION_PARENT VARCHAR ORGANIZATION_LEVEL SMALLINT","title":"GO_ORG_DIM"},{"location":"wxd-datasets-gosales/#go_org_name_lookup","text":"Column Type ORGANIZATION_CODE VARCHAR ORGANIZATION_NAME_EN VARCHAR","title":"GO_ORG_NAME_LOOKUP"},{"location":"wxd-datasets-gosales/#go_region_dim","text":"Column Type
COUNTRY_KEY INTEGER COUNTRY_CODE INTEGER FLAG_IMAGE VARCHAR ISO_THREE_LETTER_CODE VARCHAR ISO_TWO_LETTER_CODE VARCHAR ISO_THREE_DIGIT_CODE VARCHAR REGION_KEY INTEGER REGION_CODE INTEGER REGION_EN VARCHAR COUNTRY_EN VARCHAR","title":"GO_REGION_DIM"},{"location":"wxd-datasets-gosales/#go_satisfaction_dim","text":"Column Type SATISFACTION_KEY INTEGER SATISFACTION_CODE INTEGER SATISFACTION_LOWER_LIMIT FLOAT SATISFACTION_UPPER_LIMIT FLOAT SATISFACTION_DESCRIPTION_EN VARCHAR","title":"GO_SATISFACTION_DIM"},{"location":"wxd-datasets-gosales/#go_time_dim","text":"Column Type DAY_KEY INTEGER DAY_DATE DATE MONTH_KEY INTEGER CURRENT_MONTH SMALLINT MONTH_NUMBER INTEGER QUARTER_KEY INTEGER CURRENT_QUARTER SMALLINT CURRENT_YEAR SMALLINT DAY_OF_WEEK SMALLINT DAY_OF_MONTH SMALLINT DAYS_IN_MONTH SMALLINT DAY_OF_YEAR SMALLINT WEEK_OF_MONTH SMALLINT WEEK_OF_QUARTER SMALLINT WEEK_OF_YEAR SMALLINT MONTH_EN VARCHAR WEEKDAY_EN VARCHAR","title":"GO_TIME_DIM"},{"location":"wxd-datasets-gosales/#go_time_quarter_lookup","text":"Column Type QUARTER_KEY INTEGER QUARTER_EN VARCHAR","title":"GO_TIME_QUARTER_LOOKUP"},{"location":"wxd-datasets-gosales/#mrk_activity_status_dim","text":"Column Type ACTIVITY_STATUS_KEY INTEGER ACTIVITY_STATUS_CODE SMALLINT ACTIVITY_STATUS_EN VARCHAR","title":"MRK_ACTIVITY_STATUS_DIM"},{"location":"wxd-datasets-gosales/#mrk_bundle_group_lookup","text":"Column Type BUNDLE_GROUP_CODE INTEGER BUNDLE_GROUP_EN VARCHAR","title":"MRK_BUNDLE_GROUP_LOOKUP"},{"location":"wxd-datasets-gosales/#mrk_campaign_lookup","text":"Column Type CAMPAIGN_CODE INTEGER CAMPAIGN_NAME_EN VARCHAR","title":"MRK_CAMPAIGN_LOOKUP"},{"location":"wxd-datasets-gosales/#mrk_product_survey_dim","text":"Column Type PRODUCT_SURVEY_KEY INTEGER PRODUCT_TOPIC_CODE INTEGER PRODUCT_TOPIC_EN VARCHAR","title":"MRK_PRODUCT_SURVEY_DIM"},{"location":"wxd-datasets-gosales/#mrk_product_survey_fact","text":"Column Type MONTH_KEY INTEGER ORGANIZATION_KEY INTEGER RTL_COUNTRY_KEY INTEGER BRANCH_KEY INTEGER PRODUCT_KEY INTEGER PRODUCT_SURVEY_KEY INTEGER PRODUCT_TOPIC_SCORE FLOAT","title":"MRK_PRODUCT_SURVEY_FACT"},{"location":"wxd-datasets-gosales/#mrk_prod_survey_targ_fact","text":"Column Type MONTH_KEY INTEGER PRODUCT_KEY INTEGER PRODUCT_SURVEY_KEY INTEGER PRODUCT_TOPIC_TARGET FLOAT","title":"MRK_PROD_SURVEY_TARG_FACT"},{"location":"wxd-datasets-gosales/#mrk_promotion_dim","text":"Column Type PROMOTION_KEY INTEGER PROMOTION_CODE INTEGER CAMPAIGN_CODE INTEGER BUNDLE_GROUP_CODE INTEGER PROMOTION_NAME_EN VARCHAR","title":"MRK_PROMOTION_DIM"},{"location":"wxd-datasets-gosales/#mrk_promotion_fact","text":"Column Type ORGANIZATION_KEY INTEGER ORDER_DAY_KEY INTEGER RTL_COUNTRY_KEY INTEGER EMPLOYEE_KEY INTEGER RETAILER_KEY INTEGER PRODUCT_KEY INTEGER PROMOTION_KEY INTEGER SALES_ORDER_KEY INTEGER QUANTITY SMALLINT UNIT_COST DECIMAL UNIT_PRICE DECIMAL UNIT_SALE_PRICE DECIMAL GROSS_MARGIN FLOAT SALE_TOTAL DECIMAL GROSS_PROFIT DECIMAL","title":"MRK_PROMOTION_FACT"},{"location":"wxd-datasets-gosales/#mrk_promotion_plan_fact","text":"Column Type MONTH_KEY INTEGER ORGANIZATION_KEY INTEGER BRANCH_KEY INTEGER RTL_COUNTRY_KEY INTEGER PRODUCT_KEY INTEGER PROMOTION_KEY INTEGER UNIT_COST DECIMAL UNIT_PRICE DECIMAL UNIT_SALE_PRICE DECIMAL PROMOTION_PLAN_QUANTITY INTEGER PROMOTION_PLAN_REVENUE DECIMAL","title":"MRK_PROMOTION_PLAN_FACT"},{"location":"wxd-datasets-gosales/#mrk_rtl_survey_dim","text":"Column Type RETAILER_SURVEY_KEY INTEGER RETAILER_TOPIC_CODE INTEGER RETAILER_TOPIC_EN 
VARCHAR","title":"MRK_RTL_SURVEY_DIM"},{"location":"wxd-datasets-gosales/#mrk_rtl_survey_fact","text":"Column Type MONTH_KEY INTEGER ORGANIZATION_KEY INTEGER BRANCH_KEY INTEGER RTL_COUNTRY_KEY INTEGER RETAILER_KEY INTEGER RETAILER_SURVEY_KEY INTEGER RETAILER_TOPIC_SCORE FLOAT","title":"MRK_RTL_SURVEY_FACT"},{"location":"wxd-datasets-gosales/#mrk_rtl_survey_targ_fact","text":"Column Type RETAILER_SURVEY_KEY INTEGER MONTH_KEY INTEGER RETAILER_TOPIC_WEIGHT INTEGER RETAILER_TOPIC_TARGET FLOAT","title":"MRK_RTL_SURVEY_TARG_FACT"},{"location":"wxd-datasets-gosales/#sls_order_method_dim","text":"Column Type ORDER_METHOD_KEY INTEGER ORDER_METHOD_CODE INTEGER ORDER_METHOD_EN VARCHAR","title":"SLS_ORDER_METHOD_DIM"},{"location":"wxd-datasets-gosales/#sls_product_brand_lookup","text":"Column Type PRODUCT_BRAND_CODE INTEGER PRODUCT_BRAND_EN VARCHAR","title":"SLS_PRODUCT_BRAND_LOOKUP"},{"location":"wxd-datasets-gosales/#sls_product_color_lookup","text":"Column Type PRODUCT_COLOR_CODE INTEGER PRODUCT_COLOR_EN VARCHAR","title":"SLS_PRODUCT_COLOR_LOOKUP"},{"location":"wxd-datasets-gosales/#sls_product_dim","text":"Column Type PRODUCT_KEY INTEGER PRODUCT_LINE_CODE INTEGER PRODUCT_TYPE_KEY INTEGER PRODUCT_TYPE_CODE INTEGER PRODUCT_NUMBER INTEGER BASE_PRODUCT_KEY INTEGER BASE_PRODUCT_NUMBER INTEGER PRODUCT_COLOR_CODE INTEGER PRODUCT_SIZE_CODE INTEGER PRODUCT_BRAND_KEY INTEGER PRODUCT_BRAND_CODE INTEGER PRODUCT_IMAGE VARCHAR INTRODUCTION_DATE DATE DISCONTINUED_DATE DATE","title":"SLS_PRODUCT_DIM"},{"location":"wxd-datasets-gosales/#sls_product_line_lookup","text":"Column Type PRODUCT_LINE_CODE INTEGER PRODUCT_LINE_EN VARCHAR","title":"SLS_PRODUCT_LINE_LOOKUP"},{"location":"wxd-datasets-gosales/#sls_product_lookup","text":"Column Type PRODUCT_NUMBER INTEGER PRODUCT_LANGUAGE VARCHAR PRODUCT_NAME VARCHAR PRODUCT_DESCRIPTION VARCHAR","title":"SLS_PRODUCT_LOOKUP"},{"location":"wxd-datasets-gosales/#sls_product_size_lookup","text":"Column Type PRODUCT_SIZE_CODE INTEGER PRODUCT_SIZE_EN VARCHAR","title":"SLS_PRODUCT_SIZE_LOOKUP"},{"location":"wxd-datasets-gosales/#sls_product_type_lookup","text":"Column Type PRODUCT_TYPE_CODE INTEGER PRODUCT_TYPE_EN VARCHAR","title":"SLS_PRODUCT_TYPE_LOOKUP"},{"location":"wxd-datasets-gosales/#sls_rtl_dim","text":"Column Type RETAILER_SITE_KEY INTEGER RETAILER_SITE_CODE INTEGER RETAILER_KEY INTEGER RETAILER_CODE INTEGER RETAILER_NAME VARCHAR RETAILER_NAME_MB VARCHAR RETAILER_CONTACT_CODE INTEGER CONTACT_FIRST_NAME VARCHAR CONTACT_LAST_NAME VARCHAR GENDER_CODE SMALLINT CONTACT_PHONE_NUMBER VARCHAR CONTACT_EXTENSION VARCHAR CONTACT_FAX VARCHAR CONTACT_EMAIL VARCHAR RTL_ADDRESS1 VARCHAR RTL_ADDRESS2 VARCHAR RTL_CITY VARCHAR RTL_PROV_STATE VARCHAR CONTACT_FIRST_NAME_MB VARCHAR CONTACT_LAST_NAME_MB VARCHAR RTL_ADDRESS1_MB VARCHAR RTL_ADDRESS2_MB VARCHAR RTL_CITY_MB VARCHAR RTL_PROV_STATE_MB VARCHAR RTL_POSTAL_ZONE VARCHAR RTL_COUNTRY_CODE INTEGER RETAILER_START_DATE DATE RETAILER_TYPE_CODE INTEGER RETAILER_TYPE_EN VARCHAR JOB_POSITION_EN VARCHAR","title":"SLS_RTL_DIM"},{"location":"wxd-datasets-gosales/#sls_sales_fact","text":"Column Type ORDER_DAY_KEY INTEGER ORGANIZATION_KEY INTEGER EMPLOYEE_KEY INTEGER RETAILER_KEY INTEGER RETAILER_SITE_KEY INTEGER PRODUCT_KEY INTEGER PROMOTION_KEY INTEGER ORDER_METHOD_KEY INTEGER SALES_ORDER_KEY INTEGER SHIP_DAY_KEY INTEGER CLOSE_DAY_KEY INTEGER QUANTITY BIGINT UNIT_COST DECIMAL UNIT_PRICE DECIMAL UNIT_SALE_PRICE DECIMAL GROSS_MARGIN FLOAT SALE_TOTAL DECIMAL GROSS_PROFIT 
DECIMAL","title":"SLS_SALES_FACT"},{"location":"wxd-datasets-gosales/#sls_sales_order_dim","text":"Column Type SALES_ORDER_KEY INTEGER ORDER_DETAIL_CODE INTEGER ORDER_NUMBER INTEGER WAREHOUSE_BRANCH_CODE INTEGER","title":"SLS_SALES_ORDER_DIM"},{"location":"wxd-datasets-gosales/#sls_sales_targ_fact","text":"Column Type MONTH_KEY INTEGER ORGANIZATION_KEY INTEGER RTL_COUNTRY_KEY INTEGER EMPLOYEE_KEY INTEGER RETAILER_KEY INTEGER PRODUCT_TYPE_KEY INTEGER PRODUCT_BRAND_KEY INTEGER SALES_TARGET DECIMAL","title":"SLS_SALES_TARG_FACT"},{"location":"wxd-datasets-intro/","text":"Datasets There are three datasets that have been loaded into watsonx.data system for you to use while exploring the features of the product. Great Outdoors Company warehouse data Airline On-Time performance Taxi fares Data Location The datasets found above have already been preloaded into the system, so there is no need to run the scripts below unless you want to modify the schemas or location of the data. The data files can be found in the /sampledata directory. Underneath this directory you will find datasets in three different formats: Parquet - Data that has been formatted in Parquet format that can be loaded directly into Hive and queried by watsonx.data. Relational - Data that is in a delimited format that can be loaded into Db2 or PostgreSQL databases. CSV - Comma separated values that can be converted to multiple formats or used by watsonx.data. Within the Parquet and Relational directories are SQL statements that can be used to catalog and load the data into the different systems.","title":"Dataset Overview"},{"location":"wxd-datasets-intro/#datasets","text":"There are three datasets that have been loaded into watsonx.data system for you to use while exploring the features of the product. Great Outdoors Company warehouse data Airline On-Time performance Taxi fares","title":"Datasets"},{"location":"wxd-datasets-intro/#data-location","text":"The datasets found above have already been preloaded into the system, so there is no need to run the scripts below unless you want to modify the schemas or location of the data. The data files can be found in the /sampledata directory. Underneath this directory you will find datasets in three different formats: Parquet - Data that has been formatted in Parquet format that can be loaded directly into Hive and queried by watsonx.data. Relational - Data that is in a delimited format that can be loaded into Db2 or PostgreSQL databases. CSV - Comma separated values that can be converted to multiple formats or used by watsonx.data. Within the Parquet and Relational directories are SQL statements that can be used to catalog and load the data into the different systems.","title":"Data Location"},{"location":"wxd-datasets-load/","text":"External Datasets There are a variety of data sets available for you to load from external sites. Check out the following websites for a variety of public data sets that you can use. Awesome Public Datasets Kaggle Datasets US Data.Gov UCI Machine Learning Repository US Fuel Economy Note : These sites have not been checked for license restrictions on the use of the data. You are responsible for checking that the data can be used without any licensing requirements. Loading your own data You can use a browser or link to an external file repository (i.e., Box) and download data directly to your workstation. Data can be CSV, Parquet, JSON, or TXT formats. Once the data is on your workstation, use the following steps. 
Note : You cannot import customer data nor any data that has restrictions associated with its use. Any use of private data is in violation of the terms and conditions of using this image. The first step is to connect to MinIO. Extract the MinIO credentials by using the passwords command: passwords Open your browser and navigate to the MinIO console. Log in with the object store credentials found above (these will be different for your system). You should see the current buckets in MinIO. If you don't see the buckets, click on Object Browser on the left-hand side panel. Select hive-bucket from the list of buckets. You may see directories in this list other than the ones shown above. You will need to create a new path for your data set. Create a new directory for your data (fuel_economy was used for this example). MinIO will display an empty directory and suggest you load something into it. Use the Upload button on the far right side to point to your dataset on your local machine. In this example, we are using a CSV file for the 2013 fuel economy estimates for automobiles sold in the US. You may need to rename your datasets to eliminate blanks and any other special characters other than \" _ \" or \" - \". The display will show the progress of the upload into the bucket. You may need to refresh your browser to see the file in the bucket. Now that the data has been loaded into a bucket, you can catalog it in the watsonx.data UI. If you created a new bucket for this data set, you will need to register it first in the watsonx.data UI. Instructions for how to do this are found in the Working with Object Store Buckets section. Start by navigating to the watsonx.data UI and look at the Infrastructure manager. Find the bucket you uploaded your data set into and note the catalog name that it is associated with. Here we can see that the hive-bucket bucket is associated with the hive_data catalog. In the watsonx.data UI, select the Query workspace (SQL) icon. You will need to create a schema that links to this data set. The format of the command is shown below. DROP SCHEMA catalog.schema_name; CREATE SCHEMA catalog.schema_name WITH ( location = 's3a://your-bucket/data_directory' ); You will need to change the following values: catalog - The catalog that the bucket you are using is associated with schema_name - A schema name to associate your tables with data_directory - The directory in which your file is located your_bucket - The bucket the data physically resides in For the fuel economy example, using mpg as the schema, the SQL would be: DROP SCHEMA hive_data.mpg; CREATE SCHEMA hive_data.mpg WITH ( location = 's3a://hive-bucket/fuel_economy' ); Run this SQL against the Presto engine: The DROP command may fail if the schema doesn't exist, but the CREATE should work. The next step is to define what the table looks like for watsonx.data to be able to query it. The syntax of the CREATE TABLE statement is similar to: CREATE TABLE catalog.schema.tablename ( \"column_name\" \"type\", ... ) WITH ( format = 'CSV', csv_separator = ',', external_location = 's3a://your_bucket/data_directory'); You will need to create a table definition for your CSV file in order to catalog it in watsonx.data. Note that the only data type that is permitted for CSV columns is varchar . This is a restriction of the current driver. Plans are to update it to include other data types over time. 
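If you prefer to script the schema and table DDL rather than typing it into the Query workspace, the statements can also be submitted through a Presto client. The following is a minimal sketch, assuming the presto-python-client package is installed and the lab's default host, port, and credentials; the package choice and the relaxed TLS handling are assumptions for this sandbox, not part of the lab material itself.

```python
# A minimal sketch, assuming presto-python-client is installed:
#   python3 -m pip install presto-python-client --user
import prestodb

conn = prestodb.dbapi.connect(
    host="localhost",          # lab defaults (assumptions)
    port=8443,
    user="ibmlhadmin",
    catalog="hive_data",
    schema="mpg",
    http_scheme="https",
    auth=prestodb.auth.BasicAuthentication("ibmlhadmin", "password"),
)
# The lab image uses a self-signed certificate; skipping verification is a
# sandbox-only workaround (and relies on a private attribute of the client).
conn._http_session.verify = False

cur = conn.cursor()
cur.execute(
    "CREATE SCHEMA IF NOT EXISTS hive_data.mpg "
    "WITH (location = 's3a://hive-bucket/fuel_economy')"
)
cur.fetchall()  # fetch to make sure the DDL statement completes
```

The same cursor can be used to submit the CREATE TABLE statement that is built next.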
If your data set does not include a header row (a row that defines the column names), you will need to create the table definition manually. If the data set does contain a header record, you can use the following Python code to generate a CREATE TABLE statement. You will need to make sure that pandas is available. python3 -m pip install pandas --user Next run the python3 command in the shell to run an interactive Python session. python3 Place the following code into your Python window. def showcsv(catalog, schema, tablename, bucket, directory, csv_in): import pandas as pd df = pd.read_csv(csv_in,na_values=\"-\") df = df.fillna(0) column_headers = list(df.columns.values) print(\"\") print(f\"DROP TABLE IF EXISTS {catalog}.{schema}.{tablename};\") print(f\"CREATE TABLE {catalog}.{schema}.{tablename}\") print(\" (\") comma = \"\" end = \"\" for header in column_headers: print(f\"{comma}\",end=end) comma = \",\" end = \"\\n\" print(f' \"{header}\" varchar',end=\"\") print(f\" )\") print(f\"WITH (\") print(f\" format = 'CSV',\") print(f\" csv_separator = ',',\") print(f\" external_location = 's3a://{bucket}/{directory}'\") print(f\" );\") print(\"\") def makesql(): catalog = input(\"Catalog : \") schema = input(\"Schema : \") table = input(\"Table : \") bucket = input(\"Bucket : \") dir = input(\"Directory : \") csv = input(\"CSV File : \") showcsv(catalog,schema,table,bucket,dir,csv) Gather the following information on your dataset: catalog - The catalog the schema and table are created under ( hive_data ) schema - The schema name that you created to hold your table ( mpg ) bucket - Where the data is located ( hive-bucket ) directory - What directory contains your data ( fuel_economy ) csv_in - The location on your local machine where the csv file is Once you have gathered that, run the following command in your Python window and answer the prompts. makesql() >>> makesql() Catalog : hive_data Schema : mpg Table : fueleconomy Bucket : hive-bucket Directory : fuel_economy CSV File : ~/Downloads/fuel_economy_2013.csv DROP TABLE IF EXISTS hive_data.mpg.fueleconomy; CREATE TABLE hive_data.mpg.fueleconomy ( \"MODEL_YEAR\" varchar, \"MFR_NAME\" varchar, \"DIVISION\" varchar, \"CARLINE\" varchar, \"ENG_DISPL\" varchar, \"CYL\" varchar, \"TRANS_IN_FE_GUIDE\" varchar, \"CITY_FE_CONVENTIONAL_FUEL\" varchar, \"HWY_FE_CONVENTIONAL_FUEL\" varchar, \"COMB_FE_CONVENTIONAL_FUEL\" varchar, \"AIR_ASPIRATION_DESC\" varchar, \"TRANS_DESC\" varchar, \"GEARS\" varchar, \"DRIVE_DESC\" varchar, \"FUEL_UNIT_CONVENTIONAL_FUEL\" varchar, \"FUEL_UNIT_DESC_CONVENTIONAL_FUEL\" varchar, \"ANNUAL_FUEL_COST_CONVENTIONAL\" varchar, \"FUEL_METERING_SYS_DESC\" varchar ) WITH ( format = 'CSV', csv_separator = ',', external_location = 's3a://hive-bucket/fuel_economy' ); Cut and paste the output from the command into the watsonx.data Data Explorer window to create the table. Now you can query the table with the following SQL. Note that the header record still exists in the answer set since we did not remove it from the CSV file. SELECT * FROM hive_data.mpg.fueleconomy LIMIT 10","title":"Loading External Datasets"},{"location":"wxd-datasets-load/#external-datasets","text":"There are a variety of data sets available for you to load from external sites. Check out the following websites for a variety of public data sets that you can use. 
Awesome Public Datasets Kaggle Datasets US Data.Gov UCI Machine Learning Repository US Fuel Economy Note : These sites have not been checked for license restrictions on the use of the data. You are responsible for checking that the data can be used without any licensing requirements.","title":"External Datasets"},{"location":"wxd-datasets-load/#loading-your-own-data","text":"You can use a browser or link to an external file repository (e.g., Box) and download data directly to your workstation. Data can be in CSV, Parquet, JSON, or TXT format. Once the data is on your workstation, use the following steps. Note : You cannot import customer data nor any data that has restrictions associated with its use. Any use of private data is in violation of the terms and conditions of using this image. The first step is to connect to MinIO. Extract the MinIO credentials by using the passwords command: passwords Open your browser and navigate to the MinIO console. Log in with the object store credentials found above (these will be different for your system). You should see the current buckets in MinIO. If you don't see the buckets, click on Object Browser on the left-hand side panel. Select hive-bucket from the list of buckets. You may see directories in this list other than the ones shown above. You will need to create a new path for your data set. Create a new directory for your data (fuel_economy was used for this example). MinIO will display an empty directory and suggest you load something into it. Use the Upload button on the far right side to point to your dataset on your local machine. In this example, we are using a CSV file for the 2013 fuel economy estimates for automobiles sold in the US. You may need to rename your datasets to eliminate blanks and any other special characters other than \" _ \" or \" - \". The display will show the progress of the upload into the bucket. You may need to refresh your browser to see the file in the bucket. Now that the data has been loaded into a bucket, you can catalog it in the watsonx.data UI. If you created a new bucket for this data set, you will need to register it first in the watsonx.data UI. Instructions for how to do this are found in the Working with Object Store Buckets section. Start by navigating to the watsonx.data UI and look at the Infrastructure manager. Find the bucket you uploaded your data set into and note the catalog name that it is associated with. Here we can see that the hive-bucket bucket is associated with the hive_data catalog. In the watsonx.data UI, select the Query workspace (SQL) icon. You will need to create a schema that links to this data set. The format of the command is shown below. DROP SCHEMA catalog.schema_name; CREATE SCHEMA catalog.schema_name WITH ( location = 's3a://your-bucket/data_directory' ); You will need to change the following values: catalog - The catalog that the bucket you are using is associated with schema_name - A schema name to associate your tables with data_directory - The directory in which your file is located your_bucket - The bucket the data physically resides in For the fuel economy example, using mpg as the schema, the SQL would be: DROP SCHEMA hive_data.mpg; CREATE SCHEMA hive_data.mpg WITH ( location = 's3a://hive-bucket/fuel_economy' ); Run this SQL against the Presto engine: The DROP command may fail if the schema doesn't exist, but the CREATE should work. The next step is to define what the table looks like for watsonx.data to be able to query it. 
The syntax of the CREATE TABLE statement is similar to: CREATE TABLE catalog.schema.tablename ( \"column_name\" \"type\", ... ) WITH ( format = 'CSV', csv_separator = ',', external_location = 's3a://your_bucket/data_directory'); You will need to create a table definition for your CSV file in order to catalog it in watsonx.data. Note that the only data type that is permitted for CSV columns is varchar . This is a restriction of the current driver. Plans are to update it to include other data types over time. If your data set does not include a header row (a row that defines the column names), you will need to create the table definition manually. If the data set does contain a header record, you can use the following Python code to generate a CREATE TABLE statement. You will need to make sure that pandas is available. python3 -m pip install pandas --user Next run the python3 command in the shell to run an interactive Python session. python3 Place the following code into your Python window. def showcsv(catalog, schema, tablename, bucket, directory, csv_in): import pandas as pd df = pd.read_csv(csv_in,na_values=\"-\") df = df.fillna(0) column_headers = list(df.columns.values) print(\"\") print(f\"DROP TABLE IF EXISTS {catalog}.{schema}.{tablename};\") print(f\"CREATE TABLE {catalog}.{schema}.{tablename}\") print(\" (\") comma = \"\" end = \"\" for header in column_headers: print(f\"{comma}\",end=end) comma = \",\" end = \"\\n\" print(f' \"{header}\" varchar',end=\"\") print(f\" )\") print(f\"WITH (\") print(f\" format = 'CSV',\") print(f\" csv_separator = ',',\") print(f\" external_location = 's3a://{bucket}/{directory}'\") print(f\" );\") print(\"\") def makesql(): catalog = input(\"Catalog : \") schema = input(\"Schema : \") table = input(\"Table : \") bucket = input(\"Bucket : \") dir = input(\"Directory : \") csv = input(\"CSV File : \") showcsv(catalog,schema,table,bucket,dir,csv) Gather the following information on your dataset: catalog - The catalog the schema and table are created under ( hive_data ) schema - The schema name that you created to hold your table ( mpg ) table name - The name of the table ( fuel_economy ) bucket - Where the data is located ( hive-bucket ) directory - What directory contains your data ( fuel_economy ) csv_in - The location on your local machine where the csv file is Once you have gathered that, run the following command in your Python window and answer the prompts. makesql() >>> makesql() Catalog : hive_data Schema : mpg Table : fueleconomy Bucket : hive-bucket Directory : fuel_economy CSV File : ~/Downloads/fuel_economy_2013.csv DROP TABLE IF EXISTS hive_data.mpg.fueleconomy; CREATE TABLE hive_data.mpg.fueleconomy ( \"MODEL_YEAR\" varchar, \"MFR_NAME\" varchar, \"DIVISION\" varchar, \"CARLINE\" varchar, \"ENG_DISPL\" varchar, \"CYL\" varchar, \"TRANS_IN_FE_GUIDE\" varchar, \"CITY_FE_CONVENTIONAL_FUEL\" varchar, \"HWY_FE_CONVENTIONAL_FUEL\" varchar, \"COMB_FE_CONVENTIONAL_FUEL\" varchar, \"AIR_ASPIRATION_DESC\" varchar, \"TRANS_DESC\" varchar, \"GEARS\" varchar, \"DRIVE_DESC\" varchar, \"FUEL_UNIT_CONVENTIONAL_FUEL\" varchar, \"FUEL_UNIT_DESC_CONVENTIONAL_FUEL\" varchar, \"ANNUAL_FUEL_COST_CONVENTIONAL\" varchar, \"FUEL_METERING_SYS_DESC\" varchar ) WITH ( format = 'CSV', csv_separator = ',', external_location = 's3a://hive-bucket/fuel_economy' ); Cut and paste the output from the command into the watsonx.data Data Explorer window to create the table. Now you can query the table with the following SQL. 
Note that the header record still exists in the answer set since we did not remove it from the CSV file. SELECT * FROM hive_data.mpg.fueleconomy LIMIT 10","title":"Loading your own data"},{"location":"wxd-datasets-ontime/","text":"On-Time Performance Dataset The Airline On-Time performance database contains information on flights within the US from 1987 through 2020. This is a very large dataset, so only the records from January 2013 have been included inside this image. The following link provides more information on the dataset and the columns that are found in the records. Note that the version of the data used in this system does not contain the diversion records 1 through 5. These fields are blank in the data sample used. Note that the initial diversion airport does exist in the record. Airline Report On-Time Performance Dataset Disclaimer Except as expressly set forth in this agreement, the data (including enhanced data) is provided on an \"as is\" basis, without warranties or conditions of any kind, either express or implied including, without limitation, any warranties or conditions of title, non-infringement, merchantability or fitness for a particular purpose. Neither you nor any data providers shall have any liability for any direct, indirect, incidental, special, exemplary, or consequential damages (including without limitation lost profits), however caused and on any theory of liability, whether in contract, strict liability, or tort (including negligence or otherwise) arising in any way out of the use or distribution of the data or the exercise of any rights granted hereunder, even if advised of the possibility of such damages. Tables AIRCRAFT Column Type TAIL_NUMBER VARCHAR MANUFACTURER VARCHAR MODEL VARCHAR AIRLINE_ID Column Type Code INT Description VARCHAR AIRPORT_ID Column Type Code INT Description VARCHAR CANCELLATION Column Type Code INT Description VARCHAR ONTIME Column Type Year INT Quarter INT Month INT DayofMonth INT DayOfWeek INT FlightDate VARCHAR Reporting_Airline VARCHAR DOT_ID_Reporting_Airline INT IATA_CODE_Reporting_Airline VARCHAR Tail_Number VARCHAR Flight_Number_Reporting_Airline INT OriginAirportID INT OriginAirportSeqID INT OriginCityMarketID INT Origin VARCHAR OriginCityName VARCHAR OriginState VARCHAR OriginStateFips VARCHAR OriginStateName VARCHAR OriginWac INT DestAirportID INT DestAirportSeqID INT DestCityMarketID INT Dest VARCHAR DestCityName VARCHAR DestState VARCHAR DestStateFips VARCHAR DestStateName VARCHAR DestWac INT CRSDepTime INT DepTime INT DepDelay INT DepDelayMinutes INT DepDel15 INT DepartureDelayGroups INT DepTimeBlk VARCHAR TaxiOut INT WheelsOff INT WheelsOn INT TaxiIn INT CRSArrTime INT ArrTime INT ArrDelay INT ArrDelayMinutes INT ArrDel15 INT ArrivalDelayGroups INT ArrTimeBlk VARCHAR Cancelled INT CancellationCode INT Diverted INT CRSElapsedTime INT ActualElapsedTime INT AirTime SMALLINT Flights INT Distance INT DistanceGroup INT CarrierDelay INT WeatherDelay INT NASDelay INT SecurityDelay INT LateAircraftDelay INT FirstDepTime INT TotalAddGTime INT LongestAddGTime INT DivAirportLandings INT DivReachedDest INT DivActualElapsedTime INT DivArrDelay INT DivDistance INT DivAirport VARCHAR","title":"Ontime Flight Performance"},{"location":"wxd-datasets-ontime/#on-time-performance-dataset","text":"The Airline On-Time performance database contains information on flights within the US from 1987 through 2020. This is a very large dataset, so only the records from January 2013 have been included inside this image. 
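As a quick check that the data is queryable, here is a hedged example that computes the average arrival delay per airline. The connection object is the one from the Python sketch shown earlier, and the hive_data.ontime.ontime qualifier is an assumption about where this lab catalogs the dataset; adjust it to match your environment.

```python
# Assumes `conn` from the earlier presto-python-client sketch, and that the
# on-time data is cataloged as hive_data.ontime.ontime (an assumption).
cur = conn.cursor()
cur.execute(
    "SELECT Reporting_Airline, avg(ArrDelay) AS avg_arr_delay "
    "FROM hive_data.ontime.ontime "
    "GROUP BY Reporting_Airline "
    "ORDER BY avg_arr_delay DESC "
    "LIMIT 5"
)
for airline, avg_delay in cur.fetchall():
    print(airline, avg_delay)
```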
The following link provides more information on the dataset and the columns that are found in the records. Note that the version of the data used in this system does not contain the diversion records 1 through 5. These fields are blank in the data sample used. Note that the initial diversion airport does exist in the record. Airline Report On-Time Performance Dataset","title":"On-Time Performance Dataset"},{"location":"wxd-datasets-ontime/#disclaimer","text":"Except as expressly set forth in this agreement, the data (including enhanced data) is provided on an \"as is\" basis, without warranties or conditions of any kind, either express or implied including, without limitation, any warranties or conditions of title, non-infringement, merchantability or fitness for a particular purpose. Neither you nor any data providers shall have any liability for any direct, indirect, incidental, special, exemplary, or consequential damages (including without limitation lost profits), however caused and on any theory of liability, whether in contract, strict liability, or tort (including negligence or otherwise) arising in any way out of the use or distribution of the data or the exercise of any rights granted hereunder, even if advised of the possibility of such damages.","title":"Disclaimer"},{"location":"wxd-datasets-ontime/#tables","text":"","title":"Tables"},{"location":"wxd-datasets-ontime/#aircraft","text":"Column Type TAIL_NUMBER VARCHAR MANUFACTURER VARCHAR MODEL VARCHAR","title":"AIRCRAFT"},{"location":"wxd-datasets-ontime/#airline_id","text":"Column Type Code INT Description VARCHAR","title":"AIRLINE_ID"},{"location":"wxd-datasets-ontime/#airport_id","text":"Column Type Code INT Description VARCHAR","title":"AIRPORT_ID"},{"location":"wxd-datasets-ontime/#cancellation","text":"Column Type Code INT Description VARCHAR","title":"CANCELLATION"},{"location":"wxd-datasets-ontime/#ontime","text":"Column Type Year INT Quarter INT Month INT DayofMonth INT DayOfWeek INT FlightDate VARCHAR Reporting_Airline VARCHAR DOT_ID_Reporting_Airline INT IATA_CODE_Reporting_Airline VARCHAR Tail_Number VARCHAR Flight_Number_Reporting_Airline INT OriginAirportID INT OriginAirportSeqID INT OriginCityMarketID INT Origin VARCHAR OriginCityName VARCHAR OriginState VARCHAR OriginStateFips VARCHAR OriginStateName VARCHAR OriginWac INT DestAirportID INT DestAirportSeqID INT DestCityMarketID INT Dest VARCHAR DestCityName VARCHAR DestState VARCHAR DestStateFips VARCHAR DestStateName VARCHAR DestWac INT CRSDepTime INT DepTime INT DepDelay INT DepDelayMinutes INT DepDel15 INT DepartureDelayGroups INT DepTimeBlk VARCHAR TaxiOut INT WheelsOff INT WheelsOn INT TaxiIn INT CRSArrTime INT ArrTime INT ArrDelay INT ArrDelayMinutes INT ArrDel15 INT ArrivalDelayGroups INT ArrTimeBlk VARCHAR Cancelled INT CancellationCode INT Diverted INT CRSElapsedTime INT ActualElapsedTime INT AirTime SMALLINT Flights INT Distance INT DistanceGroup INT CarrierDelay INT WeatherDelay INT NASDelay INT SecurityDelay INT LateAircraftDelay INT FirstDepTime INT TotalAddGTime INT LongestAddGTime INT DivAirportLandings INT DivReachedDest INT DivActualElapsedTime INT DivArrDelay INT DivDistance INT DivAirport VARCHAR","title":"ONTIME"},{"location":"wxd-datasets-taxi/","text":"Chicago Taxi Data Taxi trips are reported to the City of Chicago in its role as a regulatory agency. To protect privacy but allow for aggregate analyses, the Taxi ID is consistent for any given taxi medallion number but does not show the number. 
The data set used in this system contains records from January 1st, 2013, and does not include the census tract value or the Taxi ID. Taxi Trips Disclaimer This site provides applications using data that has been modified for use from its original source, www.cityofchicago.org, the official website of the City of Chicago. The City of Chicago makes no claims as to the content, accuracy, timeliness, or completeness of the data provided at this site. The data provided at this site is subject to change at any time. It is understood that the data provided at this site is being used at one\u2019s own risk. Tables TAXIRIDES Column Type TRIP_ID int COMPANY varchar DROPOFF_LATITUDE double DROPOFF_LONGITUDE double EXTRAS double FARE double PAYMENT_TYPE varchar PICKUP_LATITUDE double PICKUP_LONGITUDE double TIPS double TOLLS double TRIP_END_TIMESTAMP timestamp TRIP_MILES double TRIP_SECONDS int TRIP_START_TIMESTAMP timestamp TRIP_TOTAL double","title":"Taxi Rides"},{"location":"wxd-datasets-taxi/#chicago-taxi-data","text":"Taxi trips are reported to the City of Chicago in its role as a regulatory agency. To protect privacy but allow for aggregate analyses, the Taxi ID is consistent for any given taxi medallion number but does not show the number. The data set used in this system contains records from January 1st, 2013, and does not include the census tract value or the Taxi ID. Taxi Trips","title":"Chicago Taxi Data"},{"location":"wxd-datasets-taxi/#disclaimer","text":"This site provides applications using data that has been modified for use from its original source, www.cityofchicago.org, the official website of the City of Chicago. The City of Chicago makes no claims as to the content, accuracy, timeliness, or completeness of the data provided at this site. The data provided at this site is subject to change at any time. It is understood that the data provided at this site is being used at one\u2019s own risk.","title":"Disclaimer"},{"location":"wxd-datasets-taxi/#tables","text":"","title":"Tables"},{"location":"wxd-datasets-taxi/#taxirides","text":"Column Type TRIP_ID int COMPANY varchar DROPOFF_LATITUDE double DROPOFF_LONGITUDE double EXTRAS double FARE double PAYMENT_TYPE varchar PICKUP_LATITUDE double PICKUP_LONGITUDE double TIPS double TOLLS double TRIP_END_TIMESTAMP timestamp TRIP_MILES double TRIP_SECONDS int TRIP_START_TIMESTAMP timestamp TRIP_TOTAL double","title":"TAXIRIDES"},{"location":"wxd-datasets/","text":"Datasets There are three datasets that have been loaded into the watsonx.data system for you to use while exploring the features of the product. These links will give you more details on each of the data sets, including options for loading your own data into this environment. Great Outdoors Company warehouse data Airline On-Time performance Taxi fares For information on other sources of data and how to import that data, see the following links. 
Alternate Data Sets Loading External Datasets","title":"Datasets"},{"location":"wxd-dbeaver/","text":"dBeaver Client Tool You could use any tool that supports connectivity through JDBC drivers to connect to watsonx.data, but we chose to use dBeaver for this lab. dBeaver is a client tool that we can use to connect to watsonx.data and execute queries. The tool has been installed in the watsonx user's home directory. To access dBeaver, you must use the VNC service which has been installed on this server for you. Start dBeaver Locally To start dBeaver, you must be connected to the VM console of the Linux server as the watsonx user (see Accessing the Console ). In the virtual machine, click on the Applications button, choose the Database folder and click on the dBeaver icon. The start-up screen for dBeaver will display. The dBeaver program may ask if you want to create an empty database or update the release. Just say No. The first dialog from dBeaver will ask you to create a database connection. If you do not see this screen, select Database, and then select New Database Connection: Catalog watsonx.data Connection We will use the PrestoDB JDBC connector (NOT PrestoSQL; PrestoSQL is the former name of Trino, a variant of PrestoDB, which might work but is not used in this lab). Select SQL (see left side) and scroll down until you see PrestoDB. Select PrestoDB and then press \"Next\". The following screen will be displayed. Enter the following values into the dialog. Note : These settings are case-sensitive. Host: localhost Port: 8443 Username: ibmlhadmin Password: password Database: tpch Then select the Driver Properties tab. You might be asked to download the database driver. Make sure to select \"Force Download\"; otherwise it will not properly download the driver. Once downloaded it will display the Driver properties dialog. Press the [+] button on the bottom left of the User Properties list. You need to enter three properties: SSL True SSLTrustStorePath /certs/presto-key.jks SSLTrustStorePassword watsonx.data Enter the property name \"SSL\", in uppercase (the parameter is case-sensitive!). When you hit OK it will display the setting in the list. Click on the SSL field, update the value to True, and hit Enter. Add another field called SSLTrustStorePath and give it a value of /certs/presto-key.jks and finally add the SSLTrustStorePassword setting with a value of watsonx.data . The panel should now contain three values. Press Finish when done. You should now see the TPCH database on the left panel. Clicking on the >TPCH line should display the objects that are found in the database. You can now use dBeaver to navigate through the different schemas in the Presto database. The iceberg_data schema should also be visible in the dBeaver console. Open the iceberg_data catalog and search for the customer table under the workshop schema. This schema will only exist if you created it in the previous section on MinIO.","title":"dBeaver"},{"location":"wxd-dbeaver/#dbeaver-client-tool","text":"You could use any tool that supports connectivity through JDBC drivers to connect to watsonx.data, but we chose to use dBeaver for this lab. dBeaver is a client tool that we can use to connect to watsonx.data and execute queries. The tool has been installed in the watsonx user's home directory. 
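Before starting dBeaver, you can optionally verify the same connection values from a script. This is a minimal sketch using the presto-python-client package, mirroring the host, port, credentials, and tpch catalog configured in this section; the package choice and the skipped certificate verification are lab-only assumptions, and dBeaver's JKS truststore settings have no direct equivalent here.

```python
# Mirrors the dBeaver settings used in this section: localhost:8443,
# ibmlhadmin/password, catalog tpch. presto-python-client is an assumed,
# optional dependency and is not part of the lab image.
import prestodb

conn = prestodb.dbapi.connect(
    host="localhost",
    port=8443,
    user="ibmlhadmin",
    catalog="tpch",
    schema="tiny",
    http_scheme="https",
    auth=prestodb.auth.BasicAuthentication("ibmlhadmin", "password"),
)
conn._http_session.verify = False  # self-signed lab certificate (sandbox only)

cur = conn.cursor()
cur.execute("SHOW SCHEMAS")
print([row[0] for row in cur.fetchall()])  # should include tiny, sf1, ...
```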
To access dBeaver, you must use the VNC service which has been installed on this server for you.","title":"dBeaver Client Tool"},{"location":"wxd-dbeaver/#start-dbeaver-locally","text":"To start dBeaver, you must be connected to the VM console of the Linux server as the watsonx user (see Accessing the Console ). In the virtual machine, click on the Applications button, choose the Database folder and click on the dBeaver icon. The start-up screen for dBeaver will display. The dBeaver program may ask if you want to create an empty database or update the release. Just say No. The first dialog from dBeaver will ask you to create a database connection. If you do not see this screen, select Database, and then select New Database Connection:","title":"Start dBeaver Locally"},{"location":"wxd-dbeaver/#catalog-watsonxdata-connection","text":"We will use the PrestoDB JDBC connector (NOT PrestoSQL; PrestoSQL is the former name of Trino, a variant of PrestoDB, which might work but is not used in this lab). Select SQL (see left side) and scroll down until you see PrestoDB. Select PrestoDB and then press \"Next\". The following screen will be displayed. Enter the following values into the dialog. Note : These settings are case-sensitive. Host: localhost Port: 8443 Username: ibmlhadmin Password: password Database: tpch Then select the Driver Properties tab. You might be asked to download the database driver. Make sure to select \"Force Download\"; otherwise it will not properly download the driver. Once downloaded it will display the Driver properties dialog. Press the [+] button on the bottom left of the User Properties list. You need to enter three properties: SSL True SSLTrustStorePath /certs/presto-key.jks SSLTrustStorePassword watsonx.data Enter the property name \"SSL\", in uppercase (the parameter is case-sensitive!). When you hit OK it will display the setting in the list. Click on the SSL field, update the value to True, and hit Enter. Add another field called SSLTrustStorePath and give it a value of /certs/presto-key.jks and finally add the SSLTrustStorePassword setting with a value of watsonx.data . The panel should now contain three values. Press Finish when done. You should now see the TPCH database on the left panel. Clicking on the >TPCH line should display the objects that are found in the database. You can now use dBeaver to navigate through the different schemas in the Presto database. The iceberg_data schema should also be visible in the dBeaver console. Open the iceberg_data catalog and search for the customer table under the workshop schema. This schema will only exist if you created it in the previous section on MinIO.","title":"Catalog watsonx.data Connection"},{"location":"wxd-disclaimer/","text":"Disclaimer Watsonx.data Copyright \u00a9 2024 by International Business Machines Corporation (IBM). All rights reserved. Printed in Canada. Except as permitted under the Copyright Act of 1976, no part of this publication may be reproduced or distributed in any form or by any means, or stored in a database or retrieval system, without the prior written permission of IBM, with the exception that the program listings may be entered, stored, and executed in a computer system, but they may not be reproduced for publication. The contents of this lab represent features that may or may not be available in the current release of any products mentioned within this lab, despite what the lab may say. 
IBM reserves the right to include or exclude any functionality mentioned in this lab for the current release of watsonx.data, or a subsequent release. In addition, any claims made in this lab are not official communications by IBM; rather, they are based on the authors' observations in unaudited testing and research. The views expressed in this lab are those of the authors and not necessarily those of the IBM Corporation; neither is liable for any of the claims, assertions, or contents in this lab. IBM's statements regarding its plans, directions, and intent are subject to change or withdrawal without notice and at IBM's sole discretion. Information regarding potential future products is intended to outline our general product direction and it should not be relied on in making a purchasing decision. The information mentioned regarding potential future products is not a commitment, promise, or legal obligation to deliver any material, code, or functionality. Information about potential future products may not be incorporated into any contract. The development, release, and timing of any future feature or functionality described for our products remain at our sole discretion. Performance is based on measurements and projections using standard IBM benchmarks in a controlled environment. The actual throughput or performance that any user will experience will vary depending upon many factors, including considerations such as the amount of multiprogramming in the user's job stream, the I/O configuration, the storage configuration, and the workload processed. Therefore, no assurance can be given that an individual user will achieve results like those stated here. U.S. Government Users Restricted Rights - Use, duplication or disclosure restricted by GSA ADP Schedule Contract with IBM. Information in this eBook (including information relating to products that have not yet been announced by IBM) has been reviewed for accuracy as of the date of initial publication and could include unintentional technical or typographical errors. IBM shall have no responsibility to update this information. THIS DOCUMENT IS DISTRIBUTED \"AS IS\" WITHOUT ANY WARRANTY, EITHER EXPRESS OR IMPLIED. IN NO EVENT SHALL IBM BE LIABLE FOR ANY DAMAGE ARISING FROM THE USE OF THIS INFORMATION, INCLUDING BUT NOT LIMITED TO, LOSS OF DATA, BUSINESS INTERRUPTION, LOSS OF PROFIT OR LOSS OF OPPORTUNITY. IBM products and services are warranted according to the terms and conditions of the agreements under which they are provided. References in this document to IBM products, programs, or services do not imply that IBM intends to make such products, programs, or services available in all countries in which IBM operates or does business. Information concerning non-IBM products was obtained from the suppliers of those products, their published announcements, or other publicly available sources. IBM has not tested those products in connection with this publication and cannot confirm the accuracy of performance, compatibility or any other claims related to non-IBM products. Questions on the capabilities of non-IBM products should be addressed to the suppliers of those products. IBM does not warrant the quality of any third-party products, or the ability of any such third-party products to interoperate with IBM's products. IBM EXPRESSLY DISCLAIMS ALL WARRANTIES, EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. 
The provision of the information contained herein is not intended to, and does not, grant any right or license under any IBM patents, copyrights, trademarks, or other intellectual property right. IBM, the IBM logo, ibm.com, Aspera\u00ae, Bluemix, Blueworks Live, CICS, Clearcase, Cognos\u00ae, DOORS\u00ae, Emptoris\u00ae, Enterprise Document Management System\u2122, FASP\u00ae, FileNet\u00ae, Global Business Services \u00ae, Global Technology Services \u00ae, IBM ExperienceOne\u2122, IBM SmartCloud\u00ae, IBM Social Business\u00ae, Information on Demand, ILOG, Maximo\u00ae, MQIntegrator\u00ae, MQSeries\u00ae, Netcool\u00ae, OMEGAMON, OpenPower, PureAnalytics\u2122, PureApplication\u00ae, pureCluster\u2122, PureCoverage\u00ae, PureData\u00ae, PureExperience\u00ae, PureFlex\u00ae, pureQuery\u00ae, pureScale\u00ae, PureSystems\u00ae, QRadar\u00ae, Rational\u00ae, Rhapsody\u00ae, Smarter Commerce\u00ae, SoDA, SPSS, Sterling Commerce\u00ae, StoredIQ, Tealeaf\u00ae, Tivoli\u00ae, Trusteer\u00ae, Unica\u00ae, urban{code}\u00ae, Watson, WebSphere\u00ae, Worklight\u00ae, X-Force\u00ae and System z\u00ae Z/OS, are trademarks of International Business Machines Corporation, registered in many jurisdictions worldwide. Other product and service names might be trademarks of IBM or other companies. A current list of IBM trademarks is available on the Web at \"Copyright and trademark information\" at: www.ibm.com/legal/copytrade.shtml. All trademarks or copyrights mentioned herein are the possession of their respective owners and IBM makes no claim of ownership by the mention of products that contain these marks.","title":"Disclaimer"},{"location":"wxd-disclaimer/#disclaimer","text":"","title":"Disclaimer"},{"location":"wxd-disclaimer/#watsondata","text":"Copyright \u00a9 2024 by International Business Machines Corporation (IBM). All rights reserved. Printed in Canada. Except as permitted under the Copyright Act of 1976, no part of this publication may be reproduced or distributed in any form or by any means, or stored in a database or retrieval system, without the prior written permission of IBM, with the exception that the program listings may be entered, stored, and executed in a computer system, but they may not be reproduced for publication. The contents of this lab represent features that may or may not be available in the current release of any products mentioned within this lab, despite what the lab may say. IBM reserves the right to include or exclude any functionality mentioned in this lab for the current release of watsonx.data, or a subsequent release. In addition, any claims made in this lab are not official communications by IBM; rather, they are based on the authors' observations in unaudited testing and research. The views expressed in this lab are those of the authors and not necessarily those of the IBM Corporation; neither is liable for any of the claims, assertions, or contents in this lab. IBM's statements regarding its plans, directions, and intent are subject to change or withdrawal without notice and at IBM's sole discretion. Information regarding potential future products is intended to outline our general product direction and it should not be relied on in making a purchasing decision. The information mentioned regarding potential future products is not a commitment, promise, or legal obligation to deliver any material, code, or functionality. Information about potential future products may not be incorporated into any contract. 
The development, release, and timing of any future feature or functionality described for our products remain at our sole discretion. Performance is based on measurements and projections using standard IBM benchmarks in a controlled environment. The actual throughput or performance that any user will experience will vary depending upon many factors, including considerations such as the amount of multiprogramming in the user's job stream, the I/O configuration, the storage configuration, and the workload processed. Therefore, no assurance can be given that an individual user will achieve results like those stated here. U.S. Government Users Restricted Rights - Use, duplication or disclosure restricted by GSA ADP Schedule Contract with IBM. Information in this eBook (including information relating to products that have not yet been announced by IBM) has been reviewed for accuracy as of the date of initial publication and could include unintentional technical or typographical errors. IBM shall have no responsibility to update this information. THIS DOCUMENT IS DISTRIBUTED \"AS IS\" WITHOUT ANY WARRANTY, EITHER EXPRESS OR IMPLIED. IN NO EVENT SHALL IBM BE LIABLE FOR ANY DAMAGE ARISING FROM THE USE OF THIS INFORMATION, INCLUDING BUT NOT LIMITED TO, LOSS OF DATA, BUSINESS INTERRUPTION, LOSS OF PROFIT OR LOSS OF OPPORTUNITY. IBM products and services are warranted according to the terms and conditions of the agreements under which they are provided. References in this document to IBM products, programs, or services do not imply that IBM intends to make such products, programs, or services available in all countries in which IBM operates or does business. Information concerning non-IBM products was obtained from the suppliers of those products, their published announcements, or other publicly available sources. IBM has not tested those products in connection with this publication and cannot confirm the accuracy of performance, compatibility or any other claims related to non-IBM products. Questions on the capabilities of non-IBM products should be addressed to the suppliers of those products. IBM does not warrant the quality of any third-party products, or the ability of any such third-party products to interoperate with IBM's products. IBM EXPRESSLY DISCLAIMS ALL WARRANTIES, EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. The provision of the information contained herein is not intended to, and does not, grant any right or license under any IBM patents, copyrights, trademarks, or other intellectual property right. 
IBM, the IBM logo, ibm.com, Aspera\u00ae, Bluemix, Blueworks Live, CICS, Clearcase, Cognos\u00ae, DOORS\u00ae, Emptoris\u00ae, Enterprise Document Management System\u2122, FASP\u00ae, FileNet\u00ae, Global Business Services \u00ae, Global Technology Services \u00ae, IBM ExperienceOne\u2122, IBM SmartCloud\u00ae, IBM Social Business\u00ae, Information on Demand, ILOG, Maximo\u00ae, MQIntegrator\u00ae, MQSeries\u00ae, Netcool\u00ae, OMEGAMON, OpenPower, PureAnalytics\u2122, PureApplication\u00ae, pureCluster\u2122, PureCoverage\u00ae, PureData\u00ae, PureExperience\u00ae, PureFlex\u00ae, pureQuery\u00ae, pureScale\u00ae, PureSystems\u00ae, QRadar\u00ae, Rational\u00ae, Rhapsody\u00ae, Smarter Commerce\u00ae, SoDA, SPSS, Sterling Commerce\u00ae, StoredIQ, Tealeaf\u00ae, Tivoli\u00ae, Trusteer\u00ae, Unica\u00ae, urban{code}\u00ae, Watson, WebSphere\u00ae, Worklight\u00ae, X-Force\u00ae and System z\u00ae Z/OS, are trademarks of International Business Machines Corporation, registered in many jurisdictions worldwide. Other product and service names might be trademarks of IBM or other companies. A current list of IBM trademarks is available on the Web at \"Copyright and trademark information\" at: www.ibm.com/legal/copytrade.shtml. All trademarks or copyrights mentioned herein are the possession of their respective owners and IBM makes no claim of ownership by the mention of products that contain these marks.","title":"Watsonx.data"},{"location":"wxd-federation/","text":"Federation with watsonx.data Watsonx.data can federate data from other data sources. There are a few out-of-the-box connectors, and additional connectors can be created using the SDK if need be, although this involves some programming and testing effort and is not a trivial exercise. We will use the existing PostgreSQL instance, add some data, and test the federation capabilities. Open the developer sandbox and use existing scripts to create a PostgreSQL database and add some data. Switch to the bin directory as the root user. cd /root/ibm-lh-dev/bin Connect to the sandbox. ./dev-sandbox.sh Create the database. /scripts/create_db.sh pgdatadb exists result: CREATE DATABASE Connect to the database. /scripts/runsql.sh pgdatadb psql (11.19, server 13.4 (Debian 13.4-4.pgdg110+1)) WARNING: psql major version 11, server major version 13. Some psql features might not work. Type \"help\" for help. Create a Table. create table t1( c1 int, c2 int); CREATE TABLE Insert some sample data. insert into t1 values(1,2); INSERT 0 1 Quit Postgres. quit Quit Sandbox. exit PostgreSQL Properties To set up federation, we need to get the credentials for the PostgreSQL database. Use the following command to get the database password. export POSTGRES_PASSWORD=$(docker exec ibm-lh-postgres printenv | grep POSTGRES_PASSWORD | sed 's/.*=//') echo \"Postgres Userid : admin\" echo \"Postgres Password : \" $POSTGRES_PASSWORD echo $POSTGRES_PASSWORD > /tmp/postgres.pw Open your browser and navigate to: Watsonx.data UI - https://region.techzone-server.com:port Credentials: username: ibmlhadmin password: password Navigate to the Infrastructure manager by clicking on the icon below the Home symbol. You should see a panel like the following. On the top right-hand corner, select Add Component->Add database. The Add database dialog is displayed. 
Enter the following values: Database type \u2013 PostgreSQL Database name \u2013 pgdatadb Hostname \u2013 ibm-lh-postgres Port \u2013 5432 Display name \u2013 pgdatadb Username \u2013 admin Password \u2013 The value that was extracted in the earlier step Catalog Name \u2013 pgdatadb Your screen should look like the one below. You can press the \"Test\" button to check to see if the connection settings are correct. Once you are satisfied with the settings, press \"Add\". The infrastructure screen should now show the Postgres database. What we are currently missing is the connection between the Presto engine and the Postgres data in pgdatadb. We must connect the pgdatadb database to the Presto engine. Use your mouse to hover over the pgdatadb icon until you see the Associate connection icon: Click on the association icon. You should see the following confirmation dialog: Select the presto-01 engine and press Save and restart engine . Press the Associate button and the screen will update to show the connection. Presto Federation First check to make sure that the Presto engine has finished starting. While the watsonx.data UI has restarted the Presto process, it takes a few seconds to become available. check_presto When the command comes back as Ready, you can start using the Presto CLI. Connect to watsonx.data and try Federation. ./presto-cli --catalog pgdatadb Show the current schemas. show schemas; Schema -------------------- pg_catalog public (2 rows) Use the public schema. use public; Select the table we created in Postgres. select * from public.t1; c1 | c2 ----+---- 1 | 2 (1 row) Join with data from other schemas (Sample TPCH+PostgreSQL). select t1.*,customer.name from tpch.tiny.customer, pgdatadb.public.t1 limit 10; c1 | c2 | name ----+----+-------------------- 1 | 2 | Customer#000000001 1 | 2 | Customer#000000002 1 | 2 | Customer#000000003 1 | 2 | Customer#000000004 1 | 2 | Customer#000000005 1 | 2 | Customer#000000006 1 | 2 | Customer#000000007 1 | 2 | Customer#000000008 (10 rows) Quit Presto. quit;","title":"Federation"},{"location":"wxd-federation/#federation-with-watsonxdata","text":"Watsonx.data can federate data from other data sources. There are a few out-of-the-box connectors, and additional connectors can be created using the SDK if need be, although this involves some programming and testing effort and is not a trivial exercise. We will use the existing PostgreSQL instance, add some data, and test the federation capabilities. Open the developer sandbox and use existing scripts to create a PostgreSQL database and add some data. Switch to the bin directory as the root user. cd /root/ibm-lh-dev/bin Connect to the sandbox. ./dev-sandbox.sh Create the database. /scripts/create_db.sh pgdatadb exists result: CREATE DATABASE Connect to the database. /scripts/runsql.sh pgdatadb psql (11.19, server 13.4 (Debian 13.4-4.pgdg110+1)) WARNING: psql major version 11, server major version 13. Some psql features might not work. Type \"help\" for help. Create a Table. create table t1( c1 int, c2 int); CREATE TABLE Insert some sample data. insert into t1 values(1,2); INSERT 0 1 Quit Postgres. quit Quit Sandbox. exit","title":"Federation with watsonx.data"},{"location":"wxd-federation/#postgresql-properties","text":"To set up federation, we need to get the credentials for the PostgreSQL database. Use the following command to get the database password. 
export POSTGRES_PASSWORD=$(docker exec ibm-lh-postgres printenv | grep POSTGRES_PASSWORD | sed 's/.*=//') echo \"Postgres Userid : admin\" echo \"Postgres Password : \" $POSTGRES_PASSWORD echo $POSTGRES_PASSWORD > /tmp/postgres.pw Open your browser and navigate to: Watsonx.data UI - https://region.techzone-server.com:port Credentials: username: ibmlhadmin password: password Navigate to the Infrastructure manager by clicking on the icon below the Home symbol. You should see a panel like the following. On the top right-hand corner, select Add Component->Add database. The Add database dialog is displayed. Enter the following values: Database type \u2013 PostgreSQL Database name \u2013 pgdatadb Hostname \u2013 ibm-lh-postgres Port \u2013 5432 Display name \u2013 pgdatadb Username \u2013 admin Password \u2013 The value that was extracted in the earlier step Catalog Name \u2013 pgdatadb Your screen should look like the one below. You can press the \"Test\" button to check to see if the connection settings are correct. Once you are satisfied with the settings, press \"Add\". The infrastructure screen should now show the Postgres database. What we are currently missing is the connection between the Presto engine and the Postgres data in pgdatadb. We must connect the pgdatadb database to the Presto engine. Use your mouse to hover over the pgdatadb icon until you see the Associate connection icon: Click on the association icon. You should see the following confirmation dialog: Select the presto-01 engine and press Save and restart engine . Press the Associate button and the screen will update to show the connection.","title":"PostgreSQL Properties"},{"location":"wxd-federation/#presto-federation","text":"First check to make sure that the Presto engine has finished starting. While the watsonx.data UI has restarted the Presto process, it takes a few seconds to become available. check_presto When the command comes back as Ready, you can start using the Presto CLI. Connect to watsonx.data and try Federation. ./presto-cli --catalog pgdatadb Show the current schemas. show schemas; Schema -------------------- pg_catalog public (2 rows) Use the public schema. use public; Select the table we created in Postgres. select * from public.t1; c1 | c2 ----+---- 1 | 2 (1 row) Join with data from other schemas (Sample TPCH+PostgreSQL). select t1.*,customer.name from tpch.tiny.customer, pgdatadb.public.t1 limit 10; c1 | c2 | name ----+----+-------------------- 1 | 2 | Customer#000000001 1 | 2 | Customer#000000002 1 | 2 | Customer#000000003 1 | 2 | Customer#000000004 1 | 2 | Customer#000000005 1 | 2 | Customer#000000006 1 | 2 | Customer#000000007 1 | 2 | Customer#000000008 (10 rows) Quit Presto. quit;","title":"Presto Federation"},{"location":"wxd-glossary/","text":"Glossary Apache Superset : Apache Superset is an open-source software application for data exploration and data visualization able to handle data at petabyte scale. Apache Superset is a modern, enterprise-ready business intelligence web application. It is fast, lightweight, intuitive, and loaded with options that make it easy for users of all skill sets to explore and visualize their data, from simple pie charts to highly detailed geospatial charts. Application Programming Interface (API) : Application Programming Interface (API) is a programmatic interface for executing functions of an application in an automated or manual fashion without using a CLI or User Interface. Buckets : Buckets are the basic containers that hold your data. 
Everything that you store in Cloud Storage must be contained in a bucket. You can use buckets to organize your data and control access to your data, but unlike directories and folders, you cannot nest buckets. Catalog : This term may have many meanings depending on context. Review below: Service Catalog - A service catalog is a comprehensive list of cloud computing services that an organization offers its customers. The catalog is the only portion of the company's service portfolio that is published and provided to customers as a support to the sale or delivery of offered services. Data Catalog - A collection of business information describing the available datasets within an organization. Metastore Catalog - A collection of technical and operational metadata allowing a query engine to overlay a virtual table on a collection of discrete data files. Connector Catalog - The named representation of a connector within the virtual warehouse of a Presto instance. Command Line Interface (CLI) : A command-line interface (CLI) is a text-based user interface (UI) used to run programs, manage computer files and interact with the computer. DBeaver : DBeaver is a SQL client software application and a database administration tool. For relational databases it uses the JDBC application programming interface to interact with databases via a JDBC driver. For other databases it uses proprietary database drivers. Federation : A federated database is a system in which several databases appear to function as a single entity. Each component database in the system is completely self-sustained and functional. When an application queries the federated database, the system figures out which of its component databases contains the data being requested and passes the request to it. Federated databases can be thought of as database virtualization in much the same way that storage virtualization makes several drives appear as one. MinIO : MinIO is a high-performance, S3 compatible object store. It is built for large scale AI/ML, data lake and database workloads. It runs on-prem and on any cloud (public or private) and from the data center to the edge. MinIO is software-defined and open source under GNU AGPL v3. Object Storage : Object storage is a data storage architecture for storing unstructured data, which sections data into units\u2014objects\u2014and stores them in a structurally flat data environment. Each object includes the data, metadata, and a unique identifier that applications can use for easy access and retrieval. Presto : Presto is a distributed database query engine (written in Java) that uses the SQL query language. Its architecture allows users to query data sources such as Hadoop, Cassandra, Kafka, AWS S3, Alluxio, MySQL, MongoDB and Teradata, and allows use of multiple data sources within a query. Presto is community-driven open-source software released under the Apache License. Presto's architecture is very similar to other database management systems using cluster computing, sometimes called massively parallel processing (MPP). SPARK : Apache Spark is an open-source unified analytics engine for large-scale data processing. Spark provides an interface for programming clusters with implicit data parallelism and fault tolerance. Spark can be used with watsonx.data but is not included in the watsonx.data environment image provided. TechZone (IBM Technology Zone) : IBM Technology Zone is the platform where the developer edition of watsonx.data with the sample data sets has been provisioned.
Generally, it allows Go To Market teams and Business Partners to easily build technical 'Show Me' live environments, POTs, prototypes, and MVPs, which can then be customized and shared with peers and customers to experience IBM Technology. VNC (Virtual Network Computing) : VNC is a cross-platform screen-sharing system that uses the Remote Frame Buffer (RFB) protocol. VNC was created to control another computer remotely. You may know it best for its role in tech support services. Use of VNC is optional. VNC can be used after the WireGuard VPN has been activated to access the watsonx.data server. WireGuard : WireGuard is a communication protocol and free and open-source software that implements encrypted virtual private networks, and was designed with the goals of ease of use, high speed performance, and low attack surface. You will need to install the WireGuard software and download the server VPN certificate in order to access the watsonx.data server.","title":"Glossary"},{"location":"wxd-glossary/#glossary","text":"Apache Superset : Apache Superset is an open-source software application for data exploration and data visualization able to handle data at petabyte scale. Apache Superset is a modern, enterprise-ready business intelligence web application. It is fast, lightweight, intuitive, and loaded with options that make it easy for users of all skill sets to explore and visualize their data, from simple pie charts to highly detailed geospatial charts. Application Programming Interface (API) : Application Programming Interface (API) is a programmatic interface for executing functions of an application in an automated or manual fashion without using a CLI or User Interface. Buckets : Buckets are the basic containers that hold your data. Everything that you store in Cloud Storage must be contained in a bucket. You can use buckets to organize your data and control access to your data, but unlike directories and folders, you cannot nest buckets. Catalog : This term may have many meanings depending on context. Review below: Service Catalog - A service catalog is a comprehensive list of cloud computing services that an organization offers its customers. The catalog is the only portion of the company's service portfolio that is published and provided to customers as a support to the sale or delivery of offered services. Data Catalog - A collection of business information describing the available datasets within an organization. Metastore Catalog - A collection of technical and operational metadata allowing a query engine to overlay a virtual table on a collection of discrete data files. Connector Catalog - The named representation of a connector within the virtual warehouse of a Presto instance. Command Line Interface (CLI) : A command-line interface (CLI) is a text-based user interface (UI) used to run programs, manage computer files and interact with the computer. DBeaver : DBeaver is a SQL client software application and a database administration tool. For relational databases it uses the JDBC application programming interface to interact with databases via a JDBC driver. For other databases it uses proprietary database drivers. Federation : A federated database is a system in which several databases appear to function as a single entity. Each component database in the system is completely self-sustained and functional. When an application queries the federated database, the system figures out which of its component databases contains the data being requested and passes the request to it.
Federated databases can be thought of as database virtualization in much the same way that storage virtualization makes several drives appear as one. MinIO : MinIO is a high-performance, S3 compatible object store. It is built for large scale AI/ML, data lake and database workloads. It runs on-prem and on any cloud (public or private) and from the data center to the edge. MinIO is software-defined and open source under GNU AGPL v3. Object Storage : Object storage is a data storage architecture for storing unstructured data, which sections data into units\u2014objects\u2014and stores them in a structurally flat data environment. Each object includes the data, metadata, and a unique identifier that applications can use for easy access and retrieval. Presto : Presto is a distributed database query engine (written in Java) that uses the SQL query language. Its architecture allows users to query data sources such as Hadoop, Cassandra, Kafka, AWS S3, Alluxio, MySQL, MongoDB and Teradata, and allows use of multiple data sources within a query. Presto is community-driven open-source software released under the Apache License. Presto's architecture is very similar to other database management systems using cluster computing, sometimes called massively parallel processing (MPP). SPARK : Apache Spark is an open-source unified analytics engine for large-scale data processing. Spark provides an interface for programming clusters with implicit data parallelism and fault tolerance. Spark can be used with watsonx.data but is not included in the watsonx.data environment image provided. TechZone (IBM Technology Zone) : IBM Technology Zone is the platform where the developer edition of watsonx.data with the sample data sets has been provisioned. Generally, it allows Go To Market teams and Business Partners to easily build technical 'Show Me' live environments, POTs, prototypes, and MVPs, which can then be customized and shared with peers and customers to experience IBM Technology. VNC (Virtual Network Computing) : VNC is a cross-platform screen-sharing system that uses the Remote Frame Buffer (RFB) protocol. VNC was created to control another computer remotely. You may know it best for its role in tech support services. Use of VNC is optional. VNC can be used after the WireGuard VPN has been activated to access the watsonx.data server. WireGuard : WireGuard is a communication protocol and free and open-source software that implements encrypted virtual private networks, and was designed with the goals of ease of use, high speed performance, and low attack surface. You will need to install the Wireguard software and download the server VPN certificate in order to access the watsonx.data server.","title":"Glossary"},{"location":"wxd-ingest/","text":"Ingesting Data In this lab we will use the ingest tool (lh-tool) alongside the IBM watsonx.data developer edition that is running in this lab. The Ingest tool is a separate install and currently needs to be downloaded after IBM watsonx.data is started. The lab image contains a copy of this code, so you will not need to download it. In addition, there is a staging file (yellowtaxi-parquet) found in the sample data directory that will be used for loading data into the system. As the root user, switch to the client bin directory. cd /root/ibm-lh-client/bin Ingest data into the IBM watsonx.data Before running the utility, we need to retrieve several credentials for MinIO and the keystore password. 
export LH_S3_ACCESS_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_ACCESS_KEY | sed 's/.*=//') export LH_S3_SECRET_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_SECRET_KEY | sed 's/.*=//') export LH_KEYSTORE_PASSWORD=$(docker exec ibm-lh-presto printenv | grep LH_KEYSTORE_PASSWORD | sed 's/.*=//') We need to generate three export lines and create a directory that will be used later in another script. The staging directory is used to move files between this system and the docker container that the ibm-lh tool is running in. export staging=/root/ibm-lh-client/localstorage/volumes/infra/staging mkdir -p ${staging} cat << EOF > ${staging}/keys.sh #!/bin/bash export access_key=$LH_S3_ACCESS_KEY export secret_key=$LH_S3_SECRET_KEY export keystore_password=$LH_KEYSTORE_PASSWORD EOF chmod +x ${staging}/keys.sh The MinIO SSL certificate needs to be copied from the docker container. In addition, the file that we want loaded into watsonx.data will be moved into the staging file directory. docker cp ibm-lh-presto:/mnt/infra/tls/lh-ssl-ts.jks ${staging}/lh-ssl-ts.jks \\cp -f /sampledata/yellow_tripdata_2022-01.parquet ${staging}/. Create a hive schema for staging the ingest file Before ingesting the file, we need to create a new schema that we will use for the table being loaded. Your TechZone reservation will include the server name and port number to connect to the watsonx.data UI. In the watsonx.data UI select the Data Explorer. You should see a screen like the following. Use the \"Create\" pulldown and select Create schema in the hive_data catalog. Select the hive_data catalog and use staging as the new schema name. Press the Create button to finish the creation of the schema. You should see the new staging schema under hive_data . You need to repeat the same process, but this time you are going to add a schema called ingest in the iceberg_data catalog. You should see the new ingest schema in the navigator screen. Start the IBM tools Container To access the tools container, we issue the ibm-lh command with the data-copy option. ./ibm-lh data-copy /////////////////////////////////////// /////////////////////////////////////// _ _ _ _ | |__ _, ,_ | || |_ _ | || '_ \\ / /\\//| |_ _| || |_ | | || |_) || | | |_ _| || | | | |_||_.__/ |_| |_| |_||_| |_| /////////////////////////////////////// /////////////////////////////////////// Once inside the utility, you can use the following command to get more details on the data-copy option. ibm-lh data-copy --help Exit from the tools container by using the exit command. exit The following script is used to ingest the taxi data (yellow_tripdata_2022_01) into watsonx.data. Choose the script which corresponds to the release of watsonx.data that you are currently running.
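If you are not sure which release level you are running, one quick check (a sketch, which assumes the developer-edition images carry the release number in their tags) is to list the images behind the running containers: docker ps --format \"{{.Image}}\" | grep ibm-lh The version tag on the ibm-lh images should tell you whether the 1.0.3 or 1.1.0 script below applies.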
Watsonx.data Version 1.0.3 cat << EOF > ${staging}/ingest-local.sh #!/bin/bash dir=/mnt/infra/staging cd \\${dir} source ./keys.sh table_name=\"iceberg_data.ingest.yellow_tripdata_2022_01_localfile\" file=\"yellow_tripdata_2022-01.parquet\" ibm-lh data-copy \\\\ --source-data-files \\${dir}/\\${file} \\\\ --target-tables \\${table_name} \\\\ --ingestion-engine-endpoint \"hostname=ibm-lh-presto-svc,port=8443\" \\\\ --staging-location s3://iceberg-bucket/ingest/ \\\\ --staging-hive-catalog hive_data \\\\ --staging-hive-schema staging \\\\ --staging-s3-creds \\\\ \"AWS_SECRET_ACCESS_KEY=\\${secret_key}\\\\ ,AWS_ACCESS_KEY_ID=\\${access_key}\\\\ ,AWS_REGION=us-east-1\\\\ ,BUCKET_NAME=iceberg-bucket\\\\ ,ENDPOINT_URL=http://ibm-lh-minio:9000\" \\\\ --create-if-not-exist \\\\ --trust-store-path \\${dir}/lh-ssl-ts.jks \\\\ --trust-store-password \\${keystore_password} \\\\ --dbuser ibmlhadmin \\\\ --dbpassword password EOF sed -i '/^$/d' ${staging}/ingest-local.sh chmod +x ${staging}/ingest-local.sh Watsonx.data Version 1.1.0 In version 1.1.0, the --staging-s3-creds option is replaced with an environment variable called STAGING_S3_CREDS . cat << EOF > ${staging}/ingest-local.sh #!/bin/bash dir=/mnt/infra/staging cd \\${dir} source ./keys.sh table_name=\"iceberg_data.ingest.yellow_tripdata_2022_01_localfile\" file=\"yellow_tripdata_2022-01.parquet\" export STAGING_S3_CREDS=\"AWS_SECRET_ACCESS_KEY=\\${secret_key}\\\\ ,AWS_ACCESS_KEY_ID=\\${access_key}\\\\ ,AWS_REGION=us-east-1\\\\ ,BUCKET_NAME=iceberg-bucket\\\\ ,ENDPOINT_URL=http://ibm-lh-minio:9000\" ibm-lh data-copy \\\\ --source-data-files \\${dir}/\\${file} \\\\ --target-tables \\${table_name} \\\\ --ingestion-engine-endpoint \"hostname=ibm-lh-presto-svc,port=8443\" \\\\ --staging-location s3://iceberg-bucket/ingest/ \\\\ --staging-hive-catalog hive_data \\\\ --staging-hive-schema staging \\\\ --create-if-not-exist \\\\ --trust-store-path \\${dir}/lh-ssl-ts.jks \\\\ --trust-store-password \\${keystore_password} \\\\ --dbuser ibmlhadmin \\\\ --dbpassword password EOF sed -i '/^$/d' ${staging}/ingest-local.sh chmod +x ${staging}/ingest-local.sh Start the Ingest Process Start the ibm-lh container again: ./ibm-lh data-copy Now run the ingest job inside the tool container. /mnt/infra/staging/ingest-local.sh Start data migration Ingesting SECTION: cmdline Reading parquet file:/staging/yellow_tripdata_2022-01.parquet Inferring source schema... Schema inferred Ingesting source folder s3://dev-bucket-01/ingest/stage_1686085369_19_ea7fa9994c96/ into target table ingest.yellow_tripdata_2022_01_localfile The specified table does not exist Target table does not exist.. creating Current State: RUNNING Rows Ingested: 408575 Current State: RUNNING Rows Ingested: 52 Current State: 100% FINISHED Done ingesting into table: ingest.yellow_tripdata_2022_01_localfile Complete migration After ingesting the data, exit the docker container. exit Refresh the IBM watsonx.data UI to view the iceberg_data catalog in the Data Explorer. Click on the yellow_tripdata table to see the schema definition. Then click on the Data sample tab to see a snippet of the data. Now we can use the UI to run a query against this imported data. Select the SQL icon on the left side of the display. On the line where the yellow_tripdata table is located, click the icon at the end of the name. This will display a drop-down list. Select \"Generate SELECT\". This will generate a SQL statement in the window to the right of the table name. Now execute the query to see the results. That completes the labs!
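As an optional last step, you can verify the ingest from the command line (a minimal sketch, assuming the presto-cli used in the earlier chapters is available in /root/ibm-lh-dev/bin): ./presto-cli --catalog iceberg_data --schema ingest --execute \"select count(*) from yellow_tripdata_2022_01_localfile;\" The count returned should match the total number of rows reported by the ingest job.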
Congratulations, you are done!","title":"Ingesting Data"},{"location":"wxd-ingest/#ingesting-data","text":"In this lab we will use the ingest tool (lh-tool) alongside the IBM watsonx.data developer edition that is running in this lab. The Ingest tool is a separate install and currently needs to be downloaded after IBM watsonx.data is started. The lab image contains a copy of this code, so you will not need to download it. In addition, there is a staging file (yellowtaxi-parquet) found in the sample data directory that will be used for loading data into the system. As the root user, switch to the client bin directory. cd /root/ibm-lh-client/bin","title":"Ingesting Data"},{"location":"wxd-ingest/#ingest-data-into-the-ibm-watsonxdata","text":"Before running the utility, we need to retrieve several credentials for MinIO and the keystore password. export LH_S3_ACCESS_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_ACCESS_KEY | sed 's/.*=//') export LH_S3_SECRET_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_SECRET_KEY | sed 's/.*=//') export LH_KEYSTORE_PASSWORD=$(docker exec ibm-lh-presto printenv | grep LH_KEYSTORE_PASSWORD | sed 's/.*=//') We need to generate three export lines and create a directory that will be used later in another script. The staging directory is used to move files between this system and the docker container that the ibm-lh tool is running in. export staging=/root/ibm-lh-client/localstorage/volumes/infra/staging mkdir -p ${staging} cat << EOF > ${staging}/keys.sh #!/bin/bash export access_key=$LH_S3_ACCESS_KEY export secret_key=$LH_S3_SECRET_KEY export keystore_password=$LH_KEYSTORE_PASSWORD EOF chmod +x ${staging}/keys.sh The MinIO SSL certificate needs to be copied from the docker container. In addition, the file that we want loaded into watsonx.data will be moved into the staging file directory. docker cp ibm-lh-presto:/mnt/infra/tls/lh-ssl-ts.jks ${staging}/lh-ssl-ts.jks \\cp -f /sampledata/yellow_tripdata_2022-01.parquet ${staging}/.","title":"Ingest data into the IBM watsonx.data"},{"location":"wxd-ingest/#create-a-hive-schema-for-staging-the-ingest-file","text":"Before ingesting the file, we need to create a new schema that we will use for the table being loaded. Your TechZone reservation will include the server name and port number to connect to the watsonx.data UI. In the watsonx.data UI select the Data Explorer. You should see a screen like the following. Use the \"Create\" pulldown and select Create schema in the hive_data catalog. Select the hive_data catalog and use staging as the new schema name. Press the Create button to finish the creation of the schema. You should see the new staging schema under hive_data . You need to repeat the same process, but this time you are going to add a schema called ingest in the iceberg_data catalog. You should see the new ingest schema in the navigator screen.","title":"Create a hive schema for staging the ingest file"},{"location":"wxd-ingest/#start-the-ibm-tools-container","text":"To access the tools container, we issue the ibm-lh command with the data-copy option. ./ibm-lh data-copy /////////////////////////////////////// /////////////////////////////////////// _ _ _ _ | |__ _, ,_ | || |_ _ | || '_ \\ / /\\//| |_ _| || |_ | | || |_) || | | |_ _| || | | | |_||_.__/ |_| |_| |_||_| |_| /////////////////////////////////////// /////////////////////////////////////// Once inside the utility, you can use the following command to get more details on the data-copy option.
ibm-lh data-copy --help Exit from the tools container by using the exit command. exit The following script is used to ingest the taxi data (yellow_tripdata_2022_01) into watsonx.data. Choose the script which corresponds to the release of watsonx.data that you are currently running.","title":"Start the IBM tools Container"},{"location":"wxd-ingest/#watsonxdata-version-103","text":"cat << EOF > ${staging}/ingest-local.sh #!/bin/bash dir=/mnt/infra/staging cd \\${dir} source ./keys.sh table_name=\"iceberg_data.ingest.yellow_tripdata_2022_01_localfile\" file=\"yellow_tripdata_2022-01.parquet\" ibm-lh data-copy \\\\ --source-data-files \\${dir}/\\${file} \\\\ --target-tables \\${table_name} \\\\ --ingestion-engine-endpoint \"hostname=ibm-lh-presto-svc,port=8443\" \\\\ --staging-location s3://iceberg-bucket/ingest/ \\\\ --staging-hive-catalog hive_data \\\\ --staging-hive-schema staging \\\\ --staging-s3-creds \\\\ \"AWS_SECRET_ACCESS_KEY=\\${secret_key}\\\\ ,AWS_ACCESS_KEY_ID=\\${access_key}\\\\ ,AWS_REGION=us-east-1\\\\ ,BUCKET_NAME=iceberg-bucket\\\\ ,ENDPOINT_URL=http://ibm-lh-minio:9000\" \\\\ --create-if-not-exist \\\\ --trust-store-path \\${dir}/lh-ssl-ts.jks \\\\ --trust-store-password \\${keystore_password} \\\\ --dbuser ibmlhadmin \\\\ --dbpassword password EOF sed -i '/^$/d' ${staging}/ingest-local.sh chmod +x ${staging}/ingest-local.sh","title":"Watsonx.data Version 1.0.3"},{"location":"wxd-ingest/#watsonxdata-version-110","text":"In version 1.1.0, the --staging-s3-creds option is replaced with an environment variable called STAGING_S3_CREDS . cat << EOF > ${staging}/ingest-local.sh #!/bin/bash dir=/mnt/infra/staging cd \\${dir} source ./keys.sh table_name=\"iceberg_data.ingest.yellow_tripdata_2022_01_localfile\" file=\"yellow_tripdata_2022-01.parquet\" export STAGING_S3_CREDS=\"AWS_SECRET_ACCESS_KEY=\\${secret_key}\\\\ ,AWS_ACCESS_KEY_ID=\\${access_key}\\\\ ,AWS_REGION=us-east-1\\\\ ,BUCKET_NAME=iceberg-bucket\\\\ ,ENDPOINT_URL=http://ibm-lh-minio:9000\" ibm-lh data-copy \\\\ --source-data-files \\${dir}/\\${file} \\\\ --target-tables \\${table_name} \\\\ --ingestion-engine-endpoint \"hostname=ibm-lh-presto-svc,port=8443\" \\\\ --staging-location s3://iceberg-bucket/ingest/ \\\\ --staging-hive-catalog hive_data \\\\ --staging-hive-schema staging \\\\ --create-if-not-exist \\\\ --trust-store-path \\${dir}/lh-ssl-ts.jks \\\\ --trust-store-password \\${keystore_password} \\\\ --dbuser ibmlhadmin \\\\ --dbpassword password EOF sed -i '/^$/d' ${staging}/ingest-local.sh chmod +x ${staging}/ingest-local.sh","title":"Watsonx.data Version 1.1.0"},{"location":"wxd-ingest/#start-the-ingest-process","text":"Start the ibm-lh container again: ./ibm-lh data-copy Now run the ingest job inside the tool container. /mnt/infra/staging/ingest-local.sh Start data migration Ingesting SECTION: cmdline Reading parquet file:/staging/yellow_tripdata_2022-01.parquet Inferring source schema... Schema inferred Ingesting source folder s3://dev-bucket-01/ingest/stage_1686085369_19_ea7fa9994c96/ into target table ingest.yellow_tripdata_2022_01_localfile The specified table does not exist Target table does not exist.. creating Current State: RUNNING Rows Ingested: 408575 Current State: RUNNING Rows Ingested: 52 Current State: 100% FINISHED Done ingesting into table: ingest.yellow_tripdata_2022_01_localfile Complete migration After ingesting the data, exit the docker container. exit Refresh the IBM watsonx.data UI to view the iceberg_data catalog in the Data Explorer. Click on the yellow_tripdata table to see the schema definition.
Then click on the Data sample tab to see a snippet of the data. Now we can use the UI to run a query against this imported data. Select the SQL icon on the left side of the display. On the line where the yellow_tripdate table is located, click the icon at the end of the name. This will display a drop-down list. Select \"Generate SELECT\". This will generate a SQL statement in the window to the right of the table name. Now execute the query to see what the results are. That completes the labs! Congratulations you are done!","title":"Start the Ingest Process"},{"location":"wxd-intro/","text":"Introducing watsonx.data The next-gen watsonx.data lakehouse is designed to overcome the costs and complexities enterprises face. This will be the world\u2019s first and only open data store with multi-engine support that is built for hybrid deployment across your entire ecosystem. WatsonX.data is the only lakehouse with multiple query engines allowing you to optimize costs and performance by pairing the right workload with the right engine. Run all workloads from a single pane of glass, eliminating trade-offs with convenience while still improving cost and performance. Deploy anywhere with full support for hybrid-cloud and multi cloud environments. Shared metadata across multiple engines eliminates the need to re-catalog, accelerating time to value while ensuring governance and eliminating costly implementation efforts. This lab uses the watsonx.data developer package. The Developer package is meant to be used on single nodes. While it uses the same code base, there are some restrictions, especially on scale. In this lab, we will open some additional ports as well to understand how everything works. We will also use additional utilities to illustrate connectivity and what makes the watsonx.data system \"open\". We organized this lab into a number of sections that cover many of the highlights and key features of watsonx.data. Access a TechZone or VMWare image for testing Checking watsonx.data status Introduction to watsonx.data components Analytical SQL Advanced SQL functions Time Travel and Federation Working with Object Store Buckets In addition, there is an Appendix which includes common errors and potential fixes or workarounds. Watsonx.data Developer Image The watsonx.data system is running on a virtual machine with the following resources: 4 vCPUs 16Gb of memory 400Gb of disk This is sufficient for running this exercises found in this lab but should not be used for performance testing or dealing with large data sets. Watsonx.data Level 3 Technical Training This system is used as a basis for the watsonx.data Level 3 Technical Training. For the detailed lab material, please refer to the following PDF found in Seismic: https://ibm.seismic.com/Link/Content/DCG37pjmPj7VmGCHj2Df8fHVmDJj","title":"Introduction"},{"location":"wxd-intro/#introducing-watsonxdata","text":"The next-gen watsonx.data lakehouse is designed to overcome the costs and complexities enterprises face. This will be the world\u2019s first and only open data store with multi-engine support that is built for hybrid deployment across your entire ecosystem. WatsonX.data is the only lakehouse with multiple query engines allowing you to optimize costs and performance by pairing the right workload with the right engine. Run all workloads from a single pane of glass, eliminating trade-offs with convenience while still improving cost and performance. Deploy anywhere with full support for hybrid-cloud and multi cloud environments. 
Shared metadata across multiple engines eliminates the need to re-catalog, accelerating time to value while ensuring governance and eliminating costly implementation efforts. This lab uses the watsonx.data developer package. The Developer package is meant to be used on single nodes. While it uses the same code base, there are some restrictions, especially on scale. In this lab, we will open some additional ports as well to understand how everything works. We will also use additional utilities to illustrate connectivity and what makes the watsonx.data system \"open\". We organized this lab into a number of sections that cover many of the highlights and key features of watsonx.data. Access a TechZone or VMWare image for testing Checking watsonx.data status Introduction to watsonx.data components Analytical SQL Advanced SQL functions Time Travel and Federation Working with Object Store Buckets In addition, there is an Appendix which includes common errors and potential fixes or workarounds.","title":"Introducing watsonx.data"},{"location":"wxd-intro/#watsonxdata-developer-image","text":"The watsonx.data system is running on a virtual machine with the following resources: 4 vCPUs 16Gb of memory 400Gb of disk This is sufficient for running this exercises found in this lab but should not be used for performance testing or dealing with large data sets.","title":"Watsonx.data Developer Image"},{"location":"wxd-intro/#watsonxdata-level-3-technical-training","text":"This system is used as a basis for the watsonx.data Level 3 Technical Training. For the detailed lab material, please refer to the following PDF found in Seismic: https://ibm.seismic.com/Link/Content/DCG37pjmPj7VmGCHj2Df8fHVmDJj","title":"Watsonx.data Level 3 Technical Training"},{"location":"wxd-jupyter/","text":"Jupyter Notebook The watsonx.data server includes the Jupyter Notebook service which provides an interactive way of exploring the features of the Presto database. The link to the Jupyter Notebook table of contents is provided in your TechZone reservation. When you initially open the link, it will request a password to view the Table of Contents: The default password for the notebook is watsonx.data . Once you enter the password, the Table of Contents will be displayed. There are 11 notebooks provided in the system, and a brief description of the notebooks are found below. Introduction to Jupyter Notebooks If you are not familiar with the use of Jupyter notebooks, this will be a good starting point. To view the notebook, click on the blue arrow found at the bottom of the box. This will open a new tab in your browser with the contents of the notebook. This notebook provides an introduction to what Jupyter Notebooks are and what the common tasks are that you can perform in a notebook. Watsonx.data Credentials This is a key notebook for you to use during your work with the watsonx.data system. This notebook provides details on the userids and passwords for the services that are running in the server. There is no need to use a terminal command line to determine what the credentials are! In addition to the userids and passwords, this notebook provides a convenient way of downloading the certificate required to connect to the Presto database. Simply click on the certificate link and it will be downloaded to your local machine. Presto Magic Commands Magic commands are special macros found in Jupyter notebooks that simplify many tasks, including the ability to run SQL commands against a database. 
This notebook provides an introduction to what magic commands are and how you can use the Presto magic commands to connect and query the Presto database. Introduction to Presto SQL The watsonx.data lab has two ways of running SQL against the Presto database: Presto CLI commands Python/Pandas/Magic commands This notebook contains all the SQL that is run in the Presto SQL section of the lab. Instead of using the presto-cli command, this notebook uses magic commands to simplify the SQL execution. You can choose either method to explore Presto SQL. Presto Federation Presto provides the ability to federate queries across different servers. This notebook explores the ability to federate a PostgreSQL table with a table found in Presto. This lab requires some knowledge of the watsonx.data UI, so it is recommended you become familiar with the UI before running this lab. Python with watsonx.data Accessing the Presto database in Python requires the use of the prestodb module which implements features of the DBAPI standard. The notebook demonstrates how to connect to the database and retrieve results. Pandas Dataframes with watsonx.data Pandas dataframes are commonly used in Jupyter notebooks to analyze data. This code will connect to Presto using a Pandas dataframe and display some data from an existing table that was created in Presto. Note that the certificate required for this notebook is provided in the environment. Accessing watsonx.data with Spark This notebook demonstrates how Spark can connect to watsonx.data and manipulate the data. This system has a local, minimally configured Spark engine that will be used to access the Presto database. This engine is sufficient to demonstrate the steps needed to connect to watsonx.data and access the data that resides in the Presto catalogs. Connecting to Db2 This notebook demonstrates connecting to the local Db2 server using Jupyter notebooks. Connecting to PostgreSQL This notebook demonstrates connecting to the local PostgreSQL server using Jupyter notebooks. Connecting to MySQL This notebook demonstrates connecting to the local MySQL server using Jupyter notebooks.","title":"Jupyter Notebook"},{"location":"wxd-jupyter/#jupyter-notebook","text":"The watsonx.data server includes the Jupyter Notebook service which provides an interactive way of exploring the features of the Presto database. The link to the Jupyter Notebook table of contents is provided in your TechZone reservation. When you initially open the link, it will request a password to view the Table of Contents: The default password for the notebook is watsonx.data . Once you enter the password, the Table of Contents will be displayed. There are 11 notebooks provided in the system, and a brief description of the notebooks are found below.","title":"Jupyter Notebook"},{"location":"wxd-jupyter/#introduction-to-jupyter-notebooks","text":"If you are not familiar with the use of Jupyter notebooks, this will be a good starting point. To view the notebook, click on the blue arrow found at the bottom of the box. This will open a new tab in your browser with the contents of the notebook. This notebook provides an introduction to what Jupyter Notebooks are and what the common tasks are that you can perform in a notebook.","title":"Introduction to Jupyter Notebooks"},{"location":"wxd-jupyter/#watsonxdata-credentials","text":"This is a key notebook for you to use during your work with the watsonx.data system. 
This notebook provides details on the userids and passwords for the services that are running in the server. There is no need to use a terminal command line to determine what the credentials are! In addition to the userids and passwords, this notebook provides a convenient way of downloading the certificate required to connect to the Presto database. Simply click on the certificate link and it will be downloaded to your local machine.","title":"Watsonx.data Credentials"},{"location":"wxd-jupyter/#presto-magic-commands","text":"Magic commands are special macros found in Jupyter notebooks that simplify many tasks, including the ability to run SQL commands against a database. This notebook provides an introduction to what magic commands are and how you can use the Presto magic commands to connect and query the Presto database.","title":"Presto Magic Commands"},{"location":"wxd-jupyter/#introduction-to-presto-sql","text":"The watsonx.data lab has two ways of running SQL against the Presto database: Presto CLI commands Python/Pandas/Magic commands This notebook contains all the SQL that is run in the Presto SQL section of the lab. Instead of using the presto-cli command, this notebook uses magic commands to simplify the SQL execution. You can choose either method to explore Presto SQL.","title":"Introduction to Presto SQL"},{"location":"wxd-jupyter/#presto-federation","text":"Presto provides the ability to federate queries across different servers. This notebook explores the ability to federate a PostgreSQL table with a table found in Presto. This lab requires some knowledge of the watsonx.data UI, so it is recommended you become familiar with the UI before running this lab.","title":"Presto Federation"},{"location":"wxd-jupyter/#python-with-watsonxdata","text":"Accessing the Presto database in Python requires the use of the prestodb module which implements features of the DBAPI standard. The notebook demonstrates how to connect to the database and retrieve results.","title":"Python with watsonx.data"},{"location":"wxd-jupyter/#pandas-dataframes-with-watsonxdata","text":"Pandas dataframes are commonly used in Jupyter notebooks to analyze data. This code will connect to Presto using a Pandas dataframe and display some data from an existing table that was created in Presto. Note that the certificate required for this notebook is provided in the environment.","title":"Pandas Dataframes with watsonx.data"},{"location":"wxd-jupyter/#accessing-watsonxdata-with-spark","text":"This notebook demonstrates how Spark can connect to watsonx.data and manipulate the data. This system has a local, minimally configured Spark engine that will be used to access the Presto database. 
This engine is sufficient to demonstrate the steps needed to connect to watsonx.data and access the data that resides in the Presto catalogs.","title":"Accessing watsonx.data with Spark"},{"location":"wxd-jupyter/#connecting-to-db2","text":"This notebook demonstrates connecting to the local Db2 server using Jupyter notebooks.","title":"Connecting to Db2"},{"location":"wxd-jupyter/#connecting-to-postgresql","text":"This notebook demonstrates connecting to the local PostgreSQL server using Jupyter notebooks.","title":"Connecting to PostgreSQL"},{"location":"wxd-jupyter/#connecting-to-mysql","text":"This notebook demonstrates connecting to the local MySQL server using Jupyter notebooks.","title":"Connecting to MySQL"},{"location":"wxd-lab-instructions/","text":"Lab Instructions URL Conventions Your TechZone reservation contains a number of URLs for the services provided in the watsonx.data server. The URL will contain the name of the server and the corresponding port number for the service. Throughout the documentation, the server name will be referred to as region.techzone-server.com and port number is referred to as port . Where you see these URLs, replace them with the values found in your reservation. Commands Throughout the labs, any command that needs to be executed will be highlighted in a grey box: cd /root/ibm-lh-dev/bin A copy icon is usually found on the far right-hand side of the command box. Use this to copy the text and paste it into your command window. You can also select the text and copy it that way. Notes : Some commands may span multiple lines, so make sure you copy everything in the box if you are not using the copy button Commands pasted into a terminal window will require that you hit the Return or Enter key for the command to be executed Commands pasted into a Presto CLI window will execute automatically System Check The watsonx.data server automatically starts all services except for Apache Superset and the VNC service. To check the status of the server, run the following commands. Make sure that you have an open terminal session and use the following command to connect to the watsonx.data server. ssh -p port watsonx@region.techzone-server.com Password is watsonx.data . Next switch to the root userid. sudo su - Switch to the development code bin directory. cd /root/ibm-lh-dev/bin Once you have switched to the development directory, you can start running watsonx.data commands. You can check the status with the following command. ./status.sh --all Output will look similar to: using /root/ibm-lh-dev/localstorage/volumes as data root directory for user: root/1001 infra config location is /root/ibm-lh-dev/localstorage/volumes/infra lhconsole-ui running 0.0.0.0:9443->8443/tcp, :::9443->8443/tcp lhconsole-nodeclient-svc running 3001/tcp lhconsole-javaapi-svc running 8090/tcp lhconsole-api running 3333/tcp, 8081/tcp ibm-lh-presto running 0.0.0.0:8443->8443/tcp, :::8443->8443/tcp ibm-lh-hive-metastore running ibm-lh-postgres running 5432/tcp ibm-lh-minio running To confirm that the software is working, run the following commands to validate the installation. Presto Engine Test Check the Presto engine by connecting to a schema. First, we need to make sure that the Presto engine has completed all startup tasks. The following command is not part of watsonx.data, but has been included to simplify checking the status of the Presto service. check_presto Waiting for Presto to start. ........................... 
Ready Note : Startup may take up to 5 minutes when the system first initializes. Once the command returns \"Ready\", you can connect to the Presto CLI. ./presto-cli --catalog tpch --schema tiny Note : If the Presto engine has not yet started (you didn't run the check_presto script), the next command may result in a useless Java error message. You may need to wait for a minute before attempting to run the statement again. Select some rows from the customer table. select * from customer limit 10; All Presto commands end with a semi-colon. The result set should include a number of rows (the results will be random). custkey | name | address | nationkey | phone | acctbal | mktsegment | comment ---------+--------------------+---------------------------------------+-----------+-----------------+---------+------------+------------------------------------------------------------------------------------------------------------------- 1 | Customer#000000001 | IVhzIApeRb ot,c,E | 15 | 25-989-741-2988 | 711.56 | BUILDING | to the even, regular platelets. regular, ironic epitaphs nag e 2 | Customer#000000002 | XSTf4,NCwDVaWNe6tEgvwfmRchLXak | 13 | 23-768-687-3665 | 121.65 | AUTOMOBILE | l accounts. blithely ironic theodolites integrate boldly: caref 3 | Customer#000000003 | MG9kdTD2WBHm | 1 | 11-719-748-3364 | 7498.12 | AUTOMOBILE | deposits eat slyly ironic, even instructions. express foxes detect slyly. blithely even accounts abov 4 | Customer#000000004 | XxVSJsLAGtn | 4 | 14-128-190-5944 | 2866.83 | MACHINERY | requests. final, regular ideas sleep final accou 5 | Customer#000000005 | KvpyuHCplrB84WgAiGV6sYpZq7Tj | 3 | 13-750-942-6364 | 794.47 | HOUSEHOLD | n accounts will have to unwind. foxes cajole accor 6 | Customer#000000006 | sKZz0CsnMD7mp4Xd0YrBvx,LREYKUWAh yVn | 20 | 30-114-968-4951 | 7638.57 | AUTOMOBILE | tions. even deposits boost according to the slyly bold packages. final accounts cajole requests. furious 7 | Customer#000000007 | TcGe5gaZNgVePxU5kRrvXBfkasDTea | 18 | 28-190-982-9759 | 9561.95 | AUTOMOBILE | ainst the ironic, express theodolites. express, even pinto beans among the exp 8 | Customer#000000008 | I0B10bB0AymmC, 0PrRYBCP1yGJ8xcBPmWhl5 | 17 | 27-147-574-9335 | 6819.74 | BUILDING | among the slyly regular theodolites kindle blithely courts. carefully even theodolites haggle slyly along the ide 9 | Customer#000000009 | xKiAFTjUsCuxfeleNqefumTrjS | 8 | 18-338-906-3675 | 8324.07 | FURNITURE | r theodolites according to the requests wake thinly excuses: pending requests haggle furiousl 10 | Customer#000000010 | 6LrEaV6KR6PLVcgl2ArL Q3rqzLzcT1 v2 | 5 | 15-741-346-9870 | 2753.54 | HOUSEHOLD | es regular deposits haggle. fur (10 rows) The output on your screen will look similar to the following: The arrows on the far right side indicate that there is more output to view. Press the right and left arrows on your keyboard to scroll the display. If the result set is small, all of the results will display on the screen and no scrolling will be available unless the results are wider than the screen size. When the display shows (END) you have reached the bottom of the output. If the display shows a colon ( : ) at the bottom of the screen, you can use the up and down arrow keys to scroll a record at a time, or the Page Up and Page Down keys to scroll a page at a time. To quit viewing the output, press the Q key. Quit the Presto CLI. The Presto quit command can be used with or without a semicolon.
quit; Congratulations, your system is now up and running!","title":"Lab Instructions"},{"location":"wxd-lab-instructions/#lab-instructions","text":"","title":"Lab Instructions"},{"location":"wxd-lab-instructions/#url-conventions","text":"Your TechZone reservation contains a number of URLs for the services provided in the watsonx.data server. The URL will contain the name of the server and the corresponding port number for the service. Throughout the documentation, the server name will be referred to as region.techzone-server.com and the port number is referred to as port . Where you see these URLs, replace them with the values found in your reservation.","title":"URL Conventions"},{"location":"wxd-lab-instructions/#commands","text":"Throughout the labs, any command that needs to be executed will be highlighted in a grey box: cd /root/ibm-lh-dev/bin A copy icon is usually found on the far right-hand side of the command box. Use this to copy the text and paste it into your command window. You can also select the text and copy it that way. Notes : Some commands may span multiple lines, so make sure you copy everything in the box if you are not using the copy button Commands pasted into a terminal window will require that you hit the Return or Enter key for the command to be executed Commands pasted into a Presto CLI window will execute automatically","title":"Commands"},{"location":"wxd-lab-instructions/#system-check","text":"The watsonx.data server automatically starts all services except for Apache Superset and the VNC service. To check the status of the server, run the following commands. Make sure that you have an open terminal session and use the following command to connect to the watsonx.data server. ssh -p port watsonx@region.techzone-server.com Password is watsonx.data . Next switch to the root userid. sudo su - Switch to the development code bin directory. cd /root/ibm-lh-dev/bin Once you have switched to the development directory, you can start running watsonx.data commands. You can check the status with the following command. ./status.sh --all Output will look similar to: using /root/ibm-lh-dev/localstorage/volumes as data root directory for user: root/1001 infra config location is /root/ibm-lh-dev/localstorage/volumes/infra lhconsole-ui running 0.0.0.0:9443->8443/tcp, :::9443->8443/tcp lhconsole-nodeclient-svc running 3001/tcp lhconsole-javaapi-svc running 8090/tcp lhconsole-api running 3333/tcp, 8081/tcp ibm-lh-presto running 0.0.0.0:8443->8443/tcp, :::8443->8443/tcp ibm-lh-hive-metastore running ibm-lh-postgres running 5432/tcp ibm-lh-minio running To confirm that the software is working, run the following commands to validate the installation.","title":"System Check"},{"location":"wxd-lab-instructions/#presto-engine-test","text":"Check the Presto engine by connecting to a schema. First, we need to make sure that the Presto engine has completed all startup tasks. The following command is not part of watsonx.data, but has been included to simplify checking the status of the Presto service. check_presto Waiting for Presto to start. ........................... Ready Note : Startup may take up to 5 minutes when the system first initializes. Once the command returns \"Ready\", you can connect to the Presto CLI. ./presto-cli --catalog tpch --schema tiny Note : If the Presto engine has not yet started (you didn't run the check_presto script), the next command may result in a useless Java error message.
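If you would rather script this wait than watch for the Ready message, a minimal sketch (assuming presto-cli is in the current directory) is to poll the engine with a trivial query until it succeeds: until ./presto-cli --execute \"select 1\" > /dev/null 2>&1; do sleep 10; done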
You may need to wait for a minute before attempting to run the statement again. Select some rows from the customer table. select * from customer limit 10; All Presto commands end with a semi-colon. The result set should include a number of rows (the results will be random). custkey | name | address | nationkey | phone | acctbal | mktsegment | comment ---------+--------------------+---------------------------------------+-----------+-----------------+---------+------------+------------------------------------------------------------------------------------------------------------------- 1 | Customer#000000001 | IVhzIApeRb ot,c,E | 15 | 25-989-741-2988 | 711.56 | BUILDING | to the even, regular platelets. regular, ironic epitaphs nag e 2 | Customer#000000002 | XSTf4,NCwDVaWNe6tEgvwfmRchLXak | 13 | 23-768-687-3665 | 121.65 | AUTOMOBILE | l accounts. blithely ironic theodolites integrate boldly: caref 3 | Customer#000000003 | MG9kdTD2WBHm | 1 | 11-719-748-3364 | 7498.12 | AUTOMOBILE | deposits eat slyly ironic, even instructions. express foxes detect slyly. blithely even accounts abov 4 | Customer#000000004 | XxVSJsLAGtn | 4 | 14-128-190-5944 | 2866.83 | MACHINERY | requests. final, regular ideas sleep final accou 5 | Customer#000000005 | KvpyuHCplrB84WgAiGV6sYpZq7Tj | 3 | 13-750-942-6364 | 794.47 | HOUSEHOLD | n accounts will have to unwind. foxes cajole accor 6 | Customer#000000006 | sKZz0CsnMD7mp4Xd0YrBvx,LREYKUWAh yVn | 20 | 30-114-968-4951 | 7638.57 | AUTOMOBILE | tions. even deposits boost according to the slyly bold packages. final accounts cajole requests. furious 7 | Customer#000000007 | TcGe5gaZNgVePxU5kRrvXBfkasDTea | 18 | 28-190-982-9759 | 9561.95 | AUTOMOBILE | ainst the ironic, express theodolites. express, even pinto beans among the exp 8 | Customer#000000008 | I0B10bB0AymmC, 0PrRYBCP1yGJ8xcBPmWhl5 | 17 | 27-147-574-9335 | 6819.74 | BUILDING | among the slyly regular theodolites kindle blithely courts. carefully even theodolites haggle slyly along the ide 9 | Customer#000000009 | xKiAFTjUsCuxfeleNqefumTrjS | 8 | 18-338-906-3675 | 8324.07 | FURNITURE | r theodolites according to the requests wake thinly excuses: pending requests haggle furiousl 10 | Customer#000000010 | 6LrEaV6KR6PLVcgl2ArL Q3rqzLzcT1 v2 | 5 | 15-741-346-9870 | 2753.54 | HOUSEHOLD | es regular deposits haggle. fur (10 rows) The output on your screen will look similar to the following: The arrows on the far right side indicate that there is more output to view. Press the right and left arrows on your keyboard to scroll the display. If the result set is small, all of the results will display on the screen and no scrolling will be available unless the results are wider than the screen size. When the display shows (END) you have reached the bottom of the output. If the display shows a colon ( : ) at the bottom of the screen, you can use the up and down arrow keys to scroll a record at a time, or the Page Up and Page Down keys to scroll a page at a time. To quit viewing the output, press the Q key. Quit the Presto CLI. The Presto quit command can be used with or without a semicolon. quit; Congratulations, your system is now up and running!","title":"Presto Engine Test"},{"location":"wxd-minio/","text":"Using the MinIO console UI MinIO is a high-performance, S3 compatible object store. Rather than connect to an external S3 object store, we are going to use MinIO locally to run with watsonx.data. To connect to MinIO, you will need to extract the MinIO credentials by querying the docker container.
You must be the root user to issue these commands. export LH_S3_ACCESS_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_ACCESS_KEY | sed 's/.*=//') export LH_S3_SECRET_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_SECRET_KEY | sed 's/.*=//') echo \"MinIO Userid : \" $LH_S3_ACCESS_KEY echo \"MinIO Password: \" $LH_S3_SECRET_KEY MinIO Userid : c4643026087cc21989eb5c12 MinIO Password: 93da45c5af87abd86c9dbc83 You can get all passwords for the system when you are logged in as the watsonx user by using the following command. cat /certs/passwords Your TechZone reservation will include the server name and port number to use when connecting to MinIO. The default port number is 9001, while the server will be referred to as region.techzone-server.com . Replace these values with those found in your reservation. Open your browser and navigate to: MinIO console - http://region.techzone-server.com:port Note : Firefox on OSX occasionally freezes when connecting to the MinIO console. The Safari browser is much more reliable. Login with object store credentials found above (These will be different for your system). You should see current buckets in MinIO. We are going to examine these buckets after we populate them with some data. Creating Schemas and Tables Not all catalogs support creation of schemas - as an example, the TPCH catalog is not writeable. We will use the iceberg_data catalog for this exercise. We will need to get some details before we continue. Make sure you are connected as the root user and are in the proper directory. cd /root/ibm-lh-dev/bin Login to the Presto CLI. ./presto-cli --catalog iceberg_data Create schema workshop in catalog iceberg_data . Note how we are using the iceberg-bucket bucket which you should have seen in the MinIO object browser. CREATE SCHEMA IF NOT EXISTS workshop with (location='s3a://iceberg-bucket/'); Show the schemas available. show schemas; Schema ---------- workshop (1 row) Use the workshop schema. use workshop; Creating tables Create a new Apache Iceberg table using existing data in the sample Customer table as part of the TPCH catalog schema called TINY. create table customer as select * from tpch.tiny.customer; Show the tables. show tables; Table ---------- customer (1 row) Quit Presto. quit; \u2003 Refresh the MinIO screen (see button on the far-right side). You should now see new objects under iceberg-bucket . Click on the bucket name and you will see the customer table. Selecting the customer object will show that there is data and metadata in it. How do we know that this data is based on Apache Iceberg? If you open the file under metadata , you should see metadata information for the data we are storing in parquet file format. Do I really need Apache Iceberg? YES, YOU DO! However, it is good to understand why. Metadata is also stored in the Parquet file format but only for the single parquet file. If we add more data/partitions, the data is split into multiple Parquet files, and we don\u2019t have a mechanism to map the table to its parquet files. Run the following example to understand this better. You need to get the access keys for MinIO before running the following lab. Make sure you are still connected as root .
export LH_S3_ACCESS_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_ACCESS_KEY | sed 's/.*=//') export LH_S3_SECRET_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_SECRET_KEY | sed 's/.*=//') Open the developer sandbox to connect to MinIO, download the selected parquet file and inspect the parquet file contents. ./dev-sandbox.sh Update the Python files to be executable (makes our commands more convenient). chmod +x /scripts/*.py List all files in the object store (MinIO). /scripts/s3-inspect.py --host ibm-lh-minio-svc:9000 --accessKey $LH_S3_ACCESS_KEY --secretKey $LH_S3_SECRET_KEY --bucket iceberg-bucket iceberg-bucket b'customer/data/e9536a5e-14a1-4823-98ed-cc22d6fc38db.parquet' 2023-06-06 14:31:47.778000+00:00 6737d7268fcb3eb459b675f27f716f48 75373 None iceberg-bucket b'customer/metadata/00000-e26c56e0-c4d7-4625-8b06-422429f6ba8d.metadata.json' 2023-06-06 14:31:48.629000+00:00 2e722c7dd83c1dd260a7e6c9503c0e04 3272 None iceberg-bucket b'customer/metadata/7cb074a4-3da7-4184-9db8-567383bb588a-m0.avro' 2023-06-06 14:31:48.401000+00:00 655a5568207cc399b8297f1488ef77e7 6342 None iceberg-bucket b'customer/metadata/snap-6143645832277262458-1-7cb074a4-3da7-4184-9db8-567383bb588a.avro' 2023-06-06 14:31:48.445000+00:00 0c3714299d43ae86a46eabdcaac1351e 3753 None You can extract the string with the following command. PARQUET=$(/scripts/s3-inspect.py --host ibm-lh-minio-svc:9000 --accessKey $LH_S3_ACCESS_KEY --secretKey $LH_S3_SECRET_KEY --bucket iceberg-bucket | grep -o -m 1 \".*'customer.*parquet\" | sed -n \"s/.*b'//p\") The file name that is retrieved is substituted into the next command. Note: The file name found in $PARQUET will be different on your system. /scripts/s3-download.py --host ibm-lh-minio-svc:9000 --accessKey $LH_S3_ACCESS_KEY --secretKey $LH_S3_SECRET_KEY --bucket iceberg-bucket --srcFile $PARQUET --destFile /tmp/x.parquet \u2003 Describe the File Contents. /scripts/describe-parquet.py /tmp/x.parquet ---------------------- metadata: created_by: num_columns: 8 num_rows: 1500 num_row_groups: 1 format_version: 1.0 serialized_size: 851 ---------------------- ---------------------- schema: custkey: int64 name: binary address: binary nationkey: int64 phone: binary acctbal: double mktsegment: binary comment: binary ---------------------- ---------------------- row group 0: num_columns: 8 num_rows: 1500 total_byte_size: 74555 ---------------------- ---------------------- row group 0, column 1: file_offset: 0 file_path: physical_type: BYTE_ARRAY num_values: 1500 path_in_schema: name is_stats_set: True statistics: has_min_max: False min: None max: None null_count: 0 distinct_count: 0 num_values: 1500 physical_type: BYTE_ARRAY logical_type: None converted_type (legacy): NONE compression: GZIP encodings: ('DELTA_BYTE_ARRAY',) has_dictionary_page: False dictionary_page_offset: None data_page_offset: 112 total_compressed_size: 599 total_uncompressed_size: 2806 ---------------------- Note : In this instance we created the table with a single select * from customer and no partitioning defined, so there was only 1 parquet file and only 1 row group. This is not the norm; we deliberately did this to show you the value of using the Apache Iceberg file format, which can be used by multiple runtimes to access Iceberg data stored in parquet format and managed by the hive metastore. Exit from the Sandbox.
exit MinIO CLI The MinIO Client mc command-line tool provides an alternative to UNIX commands like ls , cat , cp , mirror , and diff with support for both file systems and Amazon S3-compatible cloud storage services. The mc command-line tool is built for compatibility with the AWS S3 API and is tested with MinIO and AWS S3 for expected functionality and behavior. Complete details and restrictions around the use of the CLI command can be found on the MinIO Client page. You can use the MinIO CLI from a variety of clients. The MinIO ports are open in the developer edition image, which lets you load data directly from your workstation rather than using the MinIO UI. MinIO System Alias Before running commands against the MinIO server, an alias must be created that includes the access and secret key. The values can be extracted from the system by listing the contents of the /certs/passwords file or by running the passwords command as the root user. cat /certs/passwords The values for the MinIO access and secret key can also be exported with the following code: export LH_S3_ACCESS_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_ACCESS_KEY | sed 's/.*=//') export LH_S3_SECRET_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_SECRET_KEY | sed 's/.*=//') echo \"MinIO Userid : \" $LH_S3_ACCESS_KEY echo \"MinIO Password: \" $LH_S3_SECRET_KEY The alias command has the following syntax: mc alias set alias-name hostname:port access_key secret_key For a local connection, we will use the following values: Alias Name - watsonxdata Hostname \u2013 watsonxdata Port \u2013 9000 Access Key \u2013 $LH_S3_ACCESS_KEY Secret Key - $LH_S3_SECRET_KEY If you are using an external client to connect to the MinIO service, you will need the URL and Port number from the TechZone reservation. The access key and secret key will be the same values that are found above. Hostname \u2013 region.techzone-server.com Port \u2013 12345 The alias for local access is found below. mc alias set watsonxdata http://watsonxdata:9000 $LH_S3_ACCESS_KEY $LH_S3_SECRET_KEY Added `watsonxdata` successfully. List Buckets The mc command provides a number of commands that allow us to manage buckets and the files within them. The following command checks to see what buckets currently exist in the system. mc ls watsonxdata [2023-09-29 14:38:19 EDT] 0B hive-bucket/ [2023-09-29 14:38:19 EDT] 0B iceberg-bucket/ You can view the contents of a bucket by traversing down the path. mc ls watsonxdata/hive-bucket [2023-10-13 10:34:36 EDT] 0B gosalesdw/ [2023-10-13 10:34:36 EDT] 0B hive_sql/ [2023-10-13 10:34:36 EDT] 0B ontime/ [2023-10-13 10:34:36 EDT] 0B taxi/ Create a Bucket At this point we will create a new bucket to hold some data. Use the mb (make bucket) command. The command requires the alias name for the MinIO connection followed by the name of the bucket. mc mb alias-name/new-bucket The following code will create a new bucket in the system called sampledata . mc mb watsonxdata/sampledata Bucket created successfully `watsonxdata/sampledata`. We can double-check that the bucket is there. mc ls watsonxdata [2023-09-29 14:38:19 EDT] 0B hive-bucket/ [2023-09-29 14:38:19 EDT] 0B iceberg-bucket/ [2023-10-13 10:39:47 EDT] 0B sampledata/ Loading Data One of the most powerful features of the MinIO CLI is its ability to load data directly from your workstation into a bucket, rather than having to use the MinIO UI. It is also significantly faster than the UI.
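For example, to copy a single file rather than an entire directory tree, you could run the following (taxi.csv is one of the sample files described below, and the sampledata bucket was created above): mc cp /sampledata/csv/taxi/taxi.csv watsonxdata/sampledata/taxi.csv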
The next example will load data into the bucket that was just created. The directory that we will be using to load data from is called /sampledata and is found in the root directory of the watsonx.data server. ls /sampledata/csv gosales ontime taxi Next, we will load the data from each one of these directories into the sampledata bucket. The mc command allows you to select which files to place into a bucket, or an entire directory with recursion. In this case, we are loading the files from all three directories into the bucket. Note the use of the / at the end of the directory name to prevent the directory name csv from being used as the high-level directory name in the target bucket. mc cp --recursive /sampledata/csv/ watsonxdata/sampledata/ ...data/csv/taxi/taxi.csv: 306.16 MiB / 306.16 MiB \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501 147.91 MiB/s 2s We can double-check that our files are there with the --files option: mc tree --files watsonxdata/sampledata/ watsonxdata/sampledata/ \u251c\u2500 gosales \u2502 \u251c\u2500 DIST_INVENTORY_FACT.csv \u2502 \u251c\u2500 DIST_PRODUCT_FORECAST_FACT.csv \u2502 \u251c\u2500 DIST_RETURNED_ITEMS_FACT.csv \u2502 \u251c\u2500 DIST_RETURN_REASON_DIM.csv .... \u2502 \u251c\u2500 EMP_EMPLOYEE_DIM.csv \u2502 \u251c\u2500 SLS_SALES_TARG_FACT.csv \u2502 \u251c\u2500 gosales_createtable.sql \u2502 \u2514\u2500 gosales_load_postgres.sql \u251c\u2500 ontime \u2502 \u251c\u2500 aircraft.csv \u2502 \u251c\u2500 airline_id.csv \u2502 \u251c\u2500 airport_id.csv \u2502 \u251c\u2500 cancellation.csv \u2502 \u2514\u2500 ontime.csv \u2514\u2500 taxi \u2514\u2500 taxi.csv Delete a File or Bucket Use the rb (Remove bucket) command to remove a bucket and its contents. You can remove individual objects by using the rm (Remove) command and fully qualifying the object. The next command will remove the ontime.csv file from the ontime folder. mc rm watsonxdata/sampledata/ontime/ontime.csv Removed `watsonxdata/sampledata/ontime/ontime.csv`. The delete bucket command will fail if you still have data in the bucket. mc rb watsonxdata/sampledata mc: `watsonxdata/sampledata` is not empty. Retry this command with \u2018--force\u2019 flag if you want to remove `watsonxdata/sampledata` and all its contents Adding the --force option will remove the bucket and all the data in it. Use with caution! mc rb --force watsonxdata/sampledata Removed `watsonxdata/sampledata` successfully.","title":"MinIO UI"},{"location":"wxd-minio/#using-the-minio-console-ui","text":"MinIO is a high-performance, S3-compatible object store. Rather than connect to an external S3 object store, we are going to use MinIO locally to run with watsonx.data. To connect to MinIO, you will need to extract the MinIO credentials by querying the docker container. You must be the root user to issue these commands. export LH_S3_ACCESS_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_ACCESS_KEY | sed 's/.*=//') export LH_S3_SECRET_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_SECRET_KEY | sed 's/.*=//') echo \"MinIO Userid : \" $LH_S3_ACCESS_KEY echo \"MinIO Password: \" $LH_S3_SECRET_KEY MinIO Userid : c4643026087cc21989eb5c12 MinIO Password: 93da45c5af87abd86c9dbc83 You can get all passwords for the system when you are logged in as the watsonx user by using the following command. cat /certs/passwords Your TechZone reservation will include the server name and port number to use when connecting to MinIO.
The default port number is 9001, while the server will be referred to as region.techzone-server.com . Replace these values with those found in your reservation. Open your browser and navigate to: MinIO console - http://region.techzone-server.com:port Note : Firefox on OSX occasionally freezes when connecting to the MinIO console. The Safari browser is much more reliable. Log in with the object store credentials found above (these will be different on your system). You should see the current buckets in MinIO. We are going to examine these buckets after we populate them with some data.","title":"Using the MinIO console UI"},{"location":"wxd-minio/#creating-schemas-and-tables","text":"Not all catalogs support the creation of schemas - as an example, the TPCH catalog is not writable. We will use the iceberg_data catalog for this exercise. We will need to get some details before we continue. Make sure you are connected as the root user and are in the proper directory. cd /root/ibm-lh-dev/bin Login to the Presto CLI. ./presto-cli --catalog iceberg_data Create schema workshop in catalog iceberg_data . Note that we are using the iceberg-bucket bucket, which you should have seen in the MinIO object browser. CREATE SCHEMA IF NOT EXISTS workshop with (location='s3a://iceberg-bucket/'); Show the schemas available. show schemas; Schema ---------- workshop (1 row) Use the workshop schema. use workshop;","title":"Creating Schemas and Tables"},{"location":"wxd-minio/#creating-tables","text":"Create a new Apache Iceberg table using existing data from the sample Customer table in the TPCH catalog schema called TINY. create table customer as select * from tpch.tiny.customer; Show the tables. show tables; Table ---------- customer (1 row) Quit Presto. quit; \u2003 Refresh the MinIO screen (see the button on the far-right side). You should now see new objects under iceberg-bucket . Click on the bucket name and you will see the customer table. Selecting the customer object will show that it contains both data and metadata. How do we know that this data is based on Apache Iceberg? If you open the file under metadata , you should see metadata information for the data we are storing in parquet file format.","title":"Creating tables"},{"location":"wxd-minio/#do-i-really-need-apache-iceberg","text":"YES, YOU DO! However, it is good to understand why. Metadata is also stored in the Parquet file format, but only for a single parquet file. If we add more data/partitions, the data is split into multiple Parquet files, and we don\u2019t have a mechanism to map the table to its parquet files. Run the following example to understand this better. You need to get the access keys for MinIO before running the following lab. Make sure you are still connected as root . export LH_S3_ACCESS_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_ACCESS_KEY | sed 's/.*=//') export LH_S3_SECRET_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_SECRET_KEY | sed 's/.*=//') Open the developer sandbox to connect to MinIO, download the selected parquet file, and inspect its contents. ./dev-sandbox.sh Update the Python files to be executable (this makes our commands more convenient). chmod +x /scripts/*.py List all files in the object store (MinIO).
/scripts/s3-inspect.py --host ibm-lh-minio-svc:9000 --accessKey $LH_S3_ACCESS_KEY --secretKey $LH_S3_SECRET_KEY --bucket iceberg-bucket iceberg-bucket b'customer/data/e9536a5e-14a1-4823-98ed-cc22d6fc38db.parquet' 2023-06-06 14:31:47.778000+00:00 6737d7268fcb3eb459b675f27f716f48 75373 None iceberg-bucket b'customer/metadata/00000-e26c56e0-c4d7-4625-8b06-422429f6ba8d.metadata.json' 2023-06-06 14:31:48.629000+00:00 2e722c7dd83c1dd260a7e6c9503c0e04 3272 None iceberg-bucket b'customer/metadata/7cb074a4-3da7-4184-9db8-567383bb588a-m0.avro' 2023-06-06 14:31:48.401000+00:00 655a5568207cc399b8297f1488ef77e7 6342 None iceberg-bucket b'customer/metadata/snap-6143645832277262458-1-7cb074a4-3da7-4184-9db8-567383bb588a.avro' 2023-06-06 14:31:48.445000+00:00 0c3714299d43ae86a46eabdcaac1351e 3753 None You can extract the parquet file name with the following command. PARQUET=$(/scripts/s3-inspect.py --host ibm-lh-minio-svc:9000 --accessKey $LH_S3_ACCESS_KEY --secretKey $LH_S3_SECRET_KEY --bucket iceberg-bucket | grep -o -m 1 \".*'customer.*parquet\" | sed -n \"s/.*b'//p\") The file name that is retrieved is substituted into the next command. Note: The file name found in $PARQUET will be different on your system. /scripts/s3-download.py --host ibm-lh-minio-svc:9000 --accessKey $LH_S3_ACCESS_KEY --secretKey $LH_S3_SECRET_KEY --bucket iceberg-bucket --srcFile $PARQUET --destFile /tmp/x.parquet \u2003 Describe the File Contents. /scripts/describe-parquet.py /tmp/x.parquet ---------------------- metadata: created_by: num_columns: 8 num_rows: 1500 num_row_groups: 1 format_version: 1.0 serialized_size: 851 ---------------------- ---------------------- schema: custkey: int64 name: binary address: binary nationkey: int64 phone: binary acctbal: double mktsegment: binary comment: binary ---------------------- ---------------------- row group 0: num_columns: 8 num_rows: 1500 total_byte_size: 74555 ---------------------- ---------------------- row group 0, column 1: file_offset: 0 file_path: physical_type: BYTE_ARRAY num_values: 1500 path_in_schema: name is_stats_set: True statistics: has_min_max: False min: None max: None null_count: 0 distinct_count: 0 num_values: 1500 physical_type: BYTE_ARRAY logical_type: None converted_type (legacy): NONE compression: GZIP encodings: ('DELTA_BYTE_ARRAY',) has_dictionary_page: False dictionary_page_offset: None data_page_offset: 112 total_compressed_size: 599 total_uncompressed_size: 2806 ---------------------- Note : In this instance, we used an insert into select * from customer with no partitioning defined, so there was only 1 parquet file and only 1 row group. This is not the norm; we deliberately did this to show you the value of the Apache Iceberg table format, which can be used by multiple runtimes to access Iceberg data stored in parquet format and managed by the Hive metastore. Exit from the Sandbox. exit","title":"Do I really need Apache Iceberg?"},{"location":"wxd-minio/#minio-cli","text":"The MinIO Client mc command-line tool provides an alternative to UNIX commands like ls , cat , cp , mirror , and diff with support for both file systems and Amazon S3-compatible cloud storage services. The mc command-line tool is built for compatibility with the AWS S3 API and is tested with MinIO and AWS S3 for expected functionality and behavior. Complete details and restrictions around the use of the CLI command can be found on the MinIO Client page. You can use the MinIO CLI from a variety of clients.
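For example, on a macOS workstation the client can be installed with Homebrew (this assumes Homebrew is present; MinIO documents similar installers for other platforms): brew install minio/stable/mc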
The MinIO ports are open in the developer edition image, which lets you load data directly from your workstation rather than using the MinIO UI.","title":"MinIO CLI"},{"location":"wxd-minio/#minio-system-alias","text":"Before running commands against the MinIO server, an alias must be created that includes the access and secret key. The values can be extracted from the system by listing the contents of the /certs/passwords file or by running the passwords command as the root user. cat /certs/passwords The values for the MinIO access and secret key can also be exported with the following code: export LH_S3_ACCESS_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_ACCESS_KEY | sed 's/.*=//') export LH_S3_SECRET_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_SECRET_KEY | sed 's/.*=//') echo \"MinIO Userid : \" $LH_S3_ACCESS_KEY echo \"MinIO Password: \" $LH_S3_SECRET_KEY The alias command has the following syntax: mc alias set alias-name hostname:port access_key secret_key For a local connection, we will use the following values: Alias Name - watsonxdata Hostname \u2013 watsonxdata Port \u2013 9000 Access Key \u2013 $LH_S3_ACCESS_KEY Secret Key - $LH_S3_SECRET_KEY If you are using an external client to connect to the MinIO service, you will need the URL and Port number from the TechZone reservation. The access key and secret key will be the same values that are found above. Hostname \u2013 region.techzone-server.com Port \u2013 12345 The alias for local access is found below. mc alias set watsonxdata http://watsonxdata:9000 $LH_S3_ACCESS_KEY $LH_S3_SECRET_KEY Added `watsonxdata` successfully.","title":"MinIO System Alias"},{"location":"wxd-minio/#list-buckets","text":"The mc command provides a number of commands that allow us to manage buckets and the files within them. The following command checks to see what buckets currently exist in the system. mc ls watsonxdata [2023-09-29 14:38:19 EDT] 0B hive-bucket/ [2023-09-29 14:38:19 EDT] 0B iceberg-bucket/ You can view the contents of a bucket by traversing down the path. mc ls watsonxdata/hive-bucket [2023-10-13 10:34:36 EDT] 0B gosalesdw/ [2023-10-13 10:34:36 EDT] 0B hive_sql/ [2023-10-13 10:34:36 EDT] 0B ontime/ [2023-10-13 10:34:36 EDT] 0B taxi/","title":"List Buckets"},{"location":"wxd-minio/#create-a-bucket","text":"At this point we will create a new bucket to hold some data. Use the mb (make bucket) command. The command requires the alias name for the MinIO connection followed by the name of the bucket. mc mb alias-name/new-bucket The following code will create a new bucket in the system called sampledata . mc mb watsonxdata/sampledata Bucket created successfully `watsonxdata/sampledata`. We can double-check that the bucket is there. mc ls watsonxdata [2023-09-29 14:38:19 EDT] 0B hive-bucket/ [2023-09-29 14:38:19 EDT] 0B iceberg-bucket/ [2023-10-13 10:39:47 EDT] 0B sampledata/","title":"Create a Bucket"},{"location":"wxd-minio/#loading-data","text":"One of the most powerful features of the MinIO CLI is its ability to load data directly from your workstation into a bucket, rather than having to use the MinIO UI. It is also significantly faster than the UI. The next example will load data into the bucket that was just created. The directory that we will be using to load data from is called /sampledata and is found in the root directory of the watsonx.data server.
ls /sampledata/csv gosales ontime taxi Next, we will load the data from each one of these directories into the sampledata bucket. The mc command allows you to select which files to place into a bucket, or an entire directory with recursion. In this case, we are loading the files from all three directories into the bucket. Note the use of the / at the end of the directory name to prevent the directory name csv from being used as the high-level directory name in the target bucket. mc cp --recursive /sampledata/csv/ watsonxdata/sampledata/ ...data/csv/taxi/taxi.csv: 306.16 MiB / 306.16 MiB \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501 147.91 MiB/s 2s We can double-check that our files are there with the --files option: mc tree --files watsonxdata/sampledata/ watsonxdata/sampledata/ \u251c\u2500 gosales \u2502 \u251c\u2500 DIST_INVENTORY_FACT.csv \u2502 \u251c\u2500 DIST_PRODUCT_FORECAST_FACT.csv \u2502 \u251c\u2500 DIST_RETURNED_ITEMS_FACT.csv \u2502 \u251c\u2500 DIST_RETURN_REASON_DIM.csv .... \u2502 \u251c\u2500 EMP_EMPLOYEE_DIM.csv \u2502 \u251c\u2500 SLS_SALES_TARG_FACT.csv \u2502 \u251c\u2500 gosales_createtable.sql \u2502 \u2514\u2500 gosales_load_postgres.sql \u251c\u2500 ontime \u2502 \u251c\u2500 aircraft.csv \u2502 \u251c\u2500 airline_id.csv \u2502 \u251c\u2500 airport_id.csv \u2502 \u251c\u2500 cancellation.csv \u2502 \u2514\u2500 ontime.csv \u2514\u2500 taxi \u2514\u2500 taxi.csv","title":"Loading Data"},{"location":"wxd-minio/#delete-a-file-or-bucket","text":"Use the rb (Remove bucket) command to remove a bucket and its contents. You can remove individual objects by using the rm (Remove) command and fully qualifying the object. The next command will remove the ontime.csv file from the ontime folder. mc rm watsonxdata/sampledata/ontime/ontime.csv Removed `watsonxdata/sampledata/ontime/ontime.csv`. The delete bucket command will fail if you still have data in the bucket. mc rb watsonxdata/sampledata mc: `watsonxdata/sampledata` is not empty. Retry this command with \u2018--force\u2019 flag if you want to remove `watsonxdata/sampledata` and all its contents Adding the --force option will remove the bucket and all the data in it. Use with caution! mc rb --force watsonxdata/sampledata Removed `watsonxdata/sampledata` successfully.","title":"Delete a File or Bucket"},{"location":"wxd-objectstore/","text":"Working with Object Store Buckets In this lab, we will run through some exercises to understand how watsonx.data can be configured to work with multiple buckets, using IBM COS, in addition to the out-of-the-box MinIO bucket. In the GA version, there will be a user experience to facilitate such a setup; however, this lab will help you understand some service-to-service interactions & configurations. Why do we need to do this? In this lab, we will use multiple buckets as this is also how we can illustrate compute-storage separation. Out of the box, both in SaaS and Software, a tiny Object Store bucket is allocated, primarily for getting-started use cases. Customers would need to point to their own bucket for their data. The use of a remote bucket (in this example, MinIO) also showcases the \"open\" aspect of the watsonx.data system. Customers own their data and can physically access the iceberg-ed bucket using other applications or engines, even custom ones that they build themselves. Customers may also have requirements (data sovereignty) to place buckets in specific locations.
Compute/analytics engines may need to run in different locations, say closer to applications, and connect to buckets in other networks/geos. There will also be situations where the same engine federates data across multiple buckets (and other database connections). As part of the GA release, there will also be authorization & data access rules that will control which user/group can access buckets, even within the same engine. In Enterprise/Production environments, engines are expected to be ephemeral, or there can be multiple engines. When these engines come up, they will connect to different object store buckets. The list of engines will include Db2, NZ, and IBM Analytics Engine for Spark, apart from Presto. The shared metastore is critical in all of this as it helps provide relevant schema information to the engines. Create new bucket in MinIO Open your browser and navigate to the MinIO console. Check to see if the MinIO credentials exist in your terminal session. printf \"\\nAccess Key: $LH_S3_ACCESS_KEY \\nSecret Key: $LH_S3_SECRET_KEY\\n\" Userid : fcf1ec270e05a5031ca27bc9 Password: a671febd9e1e3826cf8cdcf5 If these values are blank, you need to run the following command. export LH_S3_ACCESS_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_ACCESS_KEY | sed 's/.*=//') export LH_S3_SECRET_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_SECRET_KEY | sed 's/.*=//') Click on the Buckets tab to show the current buckets in the MinIO system. You can see that we have two buckets used for the labs. We need to create a new bucket to use for our schema. Press the \"Create Bucket +\" option on the right side of the screen. Note : The size and contents of the existing buckets will be different on your system. Enter a bucket name (customer) and then press Create Bucket. You should now see your new bucket below. Open your browser and connect to the watsonx.data UI. Navigate to the Infrastructure manager by clicking on the icon below the Home symbol. Get the S3 bucket credentials. printf \"\\nAccess Key: $LH_S3_ACCESS_KEY \\nSecret Key: $LH_S3_SECRET_KEY\\n\" Click on the Add component menu and select Add bucket. Fill in the dialog with the following values. Bucket type \u2013 MinIO Bucket name \u2013 customer Display name \u2013 customer Endpoint \u2013 http://ibm-lh-minio-svc:9000 Access key \u2013 $LH_S3_ACCESS_KEY (contents of this value) Secret key \u2013 $LH_S3_SECRET_KEY (contents of this value) Activate now \u2013 Yes Catalog type - Apache Iceberg Catalog name - customer When done, press Add and Activate now. Your UI should change to display the new bucket (your screen may be slightly different). Note : This step may take a minute to complete. At this point you need to associate the bucket with the Presto engine. When you hover your mouse over the Customer catalog, the Associate icon will display. If you do not see the Associate icon, refresh the browser page. Press the Associate button and the following dialog will display. Select the presto-01 engine, press the Save and restart engine button, and wait for the screen to refresh. Note : Your display will be different. Exploring the Customer bucket First, check to make sure that the Presto engine has finished starting. While the watsonx.data UI has restarted the Presto process, it takes a few seconds to become available. check_presto Switch to the bin directory as the root user. sudo su - cd /root/ibm-lh-dev/bin Connect to Presto using the new customer catalog.
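As an aside, the Presto CLI can also run a single statement non-interactively with its --execute flag, which is handy for scripting, for example: ./presto-cli --catalog customer --execute \"show schemas\" The steps below use the interactive session instead.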
./presto-cli --catalog customer We will create a schema to store our table data, using the new catalog name we created for the customer bucket. CREATE SCHEMA IF NOT EXISTS newworkshop with (location='s3a://customer/'); Switch to the new schema. use newworkshop; Use the following SQL to create a new table in the customer bucket. create table customer as select * from tpch.tiny.customer; CREATE TABLE: 1500 rows Quit Presto. quit; You can use the Developer sandbox (bin/dev-sandbox.sh), as described in MinIO UI , to inspect the Customer bucket with the s3-inspect utility. However, it is easier to use the MinIO console to view the bucket. Open your browser and navigate to the MinIO console. From the main screen, select Object Browser and view the contents of the customer bucket. Note : You can continue to add new buckets when working with the watsonx.data UI. However, if you delete the catalog or bucket in the UI, you may not be able to re-catalog it. If this happens, create another bucket, or rename the original one if that is possible.","title":"Working with Object Store Buckets"},{"location":"wxd-objectstore/#working-with-object-store-buckets","text":"In this lab, we will run through some exercises to understand how watsonx.data can be configured to work with multiple buckets, using IBM COS, in addition to the out-of-the-box MinIO bucket. In the GA version, there will be a user experience to facilitate such a setup; however, this lab will help you understand some service-to-service interactions & configurations.","title":"Working with Object Store Buckets"},{"location":"wxd-objectstore/#why-do-we-need-to-do-this","text":"In this lab, we will use multiple buckets as this is also how we can illustrate compute-storage separation. Out of the box, both in SaaS and Software, a tiny Object Store bucket is allocated, primarily for getting-started use cases. Customers would need to point to their own bucket for their data. The use of a remote bucket (in this example, MinIO) also showcases the \"open\" aspect of the watsonx.data system. Customers own their data and can physically access the iceberg-ed bucket using other applications or engines, even custom ones that they build themselves. Customers may also have requirements (data sovereignty) to place buckets in specific locations. Compute/analytics engines may need to run in different locations, say closer to applications, and connect to buckets in other networks/geos. There will also be situations where the same engine federates data across multiple buckets (and other database connections). As part of the GA release, there will also be authorization & data access rules that will control which user/group can access buckets, even within the same engine. In Enterprise/Production environments, engines are expected to be ephemeral, or there can be multiple engines. When these engines come up, they will connect to different object store buckets. The list of engines will include Db2, NZ, and IBM Analytics Engine for Spark, apart from Presto. The shared metastore is critical in all of this as it helps provide relevant schema information to the engines.","title":"Why do we need to do this?"},{"location":"wxd-objectstore/#create-new-bucket-in-minio","text":"Open your browser and navigate to the MinIO console. Check to see if the MinIO credentials exist in your terminal session.
printf \"\\nAccess Key: $LH_S3_ACCESS_KEY \\nSecret Key: $LH_S3_SECRET_KEY\\n\" Userid : fcf1ec270e05a5031ca27bc9 Password: a671febd9e1e3826cf8cdcf5 If these values are blank, you need to run the following command. export LH_S3_ACCESS_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_ACCESS_KEY | sed 's/.*=//') export LH_S3_SECRET_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_SECRET_KEY | sed 's/.*=//') Click on the Buckets tab to show the current buckets in the MinIO system. You can see that we have two buckets used for the labs. We need to create a new bucket to use for our schema. Press the \"Create Bucket +\" option on the right side of the screen. Note : The size and contents of the existing buckets will be different on your system. Enter a bucket name (customer) and then press Create Bucket. You should now see your new bucket below. Open your browser and connect to the watsonx.data UI: Navigate to the Infrastructure manager by clicking on the icon below the Home symbol. Get the S3 bucket credentials. printf \"\\nAccess Key: $LH_S3_ACCESS_KEY \\nSecret Key: $LH_S3_SECRET_KEY\\n\" Click on the Add component menu and select Add bucket. Fill in the dialog with the following values. Bucket type \u2013 MinIO Bucket name \u2013 customer Display name \u2013 customer Endpoint \u2013 http://ibm-lh-minio-svc:9000 Access key \u2013 $LH_S3_ACCESS_KEY (contents of this value) Secret key \u2013 $LH_S3_SECRET_KEY (contents of this value) Activate now \u2013 Yes Catalog type - Apache Iceberg Catalog name - customer When done press Add and Activate now. Your UI should change to display the new bucket (Your screen may be slightly different). Note : This step may take a minute to complete. At this point you need to Associate the bucket with the Presto engine. When you hover your mouse over the Customer catalog and the Associate icon will display. If you do not see the Associate icon, refresh the browser page. Press the associate button and the following dialog will display. Select the presto-01 engine and then press the Save and restart engine button. Associate button and wait for the screen to refresh. Note : Your display will be different.","title":"Create new bucket in MinIO"},{"location":"wxd-objectstore/#exploring-the-customer-bucket","text":"First check to make sure that the Presto engine has finished starting. While the watsonx.data UI has restarted the Presto process, it takes a few seconds to become available. check_presto Switch to the bin directory as the root user. sudo su - cd /root/ibm-lh-dev/bin Connect to Presto using the new customer catalog. ./presto-cli --catalog customer We will create a schema where we store our table data using the new catalog name we created for the customer bucket. CREATE SCHEMA IF NOT EXISTS newworkshop with (location='s3a://customer/'); Switch to the new schema. use newworkshop; Use the following SQL to create a new table in the customer bucket. create table customer as select * from tpch.tiny.customer; CREATE TABLE: 1500 rows Quit Presto. quit; You can use the Developer sandbox (bin/dev-sandbox.sh), as described in MinIO UI , to inspect the Customer bucket with the s3-inspect utility. It's easier to use the MinIO console to view the bucket instead. Open your browser and navigate to the MinIO console. From the main screen select Object Browser and view the contents of the customer bucket. Note : You can continue to add new buckets when working with the watsonx.data UI. 
However, if you delete the catalog or bucket in the UI, you may not be able to re-catalog it. If this happens, create another bucket, or rename the original one if that is possible.","title":"Exploring the Customer bucket"},{"location":"wxd-presto/","text":"Using the Presto console UI Your TechZone reservation will include the server name and port number to use when connecting to the Presto UI. The default port number is 8443 and the server will be referred to as region.techzone-server.com . Replace these values with those found in your reservation. Presto console - https://region.techzone-server.com:port Credentials: username: ibmlhadmin password: password The Presto console allows you to do the following: Monitor the state of the cluster Queries being executed Queries in queue Data throughput Query details (text and plan) Note : The Presto console is very valuable when it comes to diagnosing problems with any queries you run in the watsonx.data environment. If a query fails, you can find more details in the Presto console using the instructions below. On the main Presto screen, click the Finished button (middle of the screen). A list of finished queries will display below the tab bar. You can scroll through the list of queries and get details of the execution plans. If you scroll through the list, you should see the test query \"select * from customer limit 5\". If you had a query that failed, look for the SQL in this list and continue with the next step. Click on the query ID to see details of the execution plan that Presto produced. You can get more information about the query by clicking on any of the tabs that are on this screen.
For instance, the Live Plan tab will show a visual explain of the stages that the query went through during execution. Scrolling to the bottom of this screen will also display any error messages that may have been produced by the SQL. Take time to check out the other information that is available for the query including the stage performance.","title":"Using the Presto console UI"},{"location":"wxd-prestocli/","text":"Watsonx.data Introduction Watsonx.data is based on open source PrestoDB, a distributed query engine that enables querying data stored in open file formats using open table formats for optimization or performance. Some of the characteristics which you will learn and see in action include: Compute processing is performed in memory and in parallel. Data is pipelined between query stages and over the network reducing latency overhead that one would have if disk I/O were involved. All the below tasks will be done using the Developer edition of watsonx.data. Using watsonx.data Connectivity to watsonx.data can be done using the following methods: Command line interface (CLI) JDBC drivers watsonx.data UI Connecting to watsonx.data and executing queries using CLI Open the watsonx.data CLI using the development directory. Make sure you are the root user. whoami If not, switch to the root user. sudo su - Change to the development directory. cd /root/ibm-lh-dev/bin Start the Presto CLI. ./presto-cli We are going to inspect the available catalogs in the watsonx.data system. A watsonx.data catalog contains schemas and references a data source via a connector. A connector is like a driver for a database. Watsonx.data connectors are an implementation of Presto\u2019s SPI which allows Presto to interact with a resource. There are several built-in connectors for JMX, Hive, TPCH etc., some of which you will use as part of the labs. Display the catalogs. show catalogs; Catalog --------------- hive_data iceberg_data jmx system tpcds tpch (6 rows) Let's look up what schemas are available with any given catalog. We will use the TPCH catalog which is an internal PrestoDB auto-generated catalog and look at the available schemas. show schemas in tpch; Schema -------------------- information_schema sf1 sf100 sf1000 sf10000 sf100000 sf300 sf3000 sf30000 tiny (10 rows) Quit the presto-cli interface by executing the \"quit;\" command. quit; You can connect to a specific catalog and schema and look at the tables etc. ./presto-cli --catalog tpch --schema tiny presto:tiny> You will notice that the Presto prompt includes the name of the schema we are currently connected to. Look at the available tables in the TPCH catalog under the tiny schema. show tables; Table ---------- customer lineitem nation orders part partsupp region supplier (8 rows) Inspect schema of the customer table. describe customer; Column | Type | Extra | Comment ------------+--------------+-------+--------- custkey | bigint | | name | varchar(25) | | address | varchar(40) | | nationkey | bigint | | phone | varchar(15) | | acctbal | double | | mktsegment | varchar(10) | | comment | varchar(117) | | (8 rows) You could also use the syntax below to achieve the same result. show columns from customer; Column | Type | Extra | Comment -----------+--------------+-------+--------- custkey | bigint | | name | varchar(25) | | address | varchar(40) | | nationkey | bigint | | phone | varchar(15) | | acctbal | double | | mktsegment | varchar(10) | | comment | varchar(117) | | (8 rows) Inspect available functions. 
show functions like 'date%'; Function | Return Type | Argument Types | Function Type | Deterministic | Description | Variable Arity | Built In | Temporary | Language -------------+--------------------------+----------------------------------------------------------------+---------------+---------------+-------------------------------------------------------------+----------------+----------+-----------+---------- date | date | timestamp | scalar | true | | false | true | false | date | date | timestamp with time zone | scalar | true | | false | true | false | date | date | varchar(x) | scalar | true | | false | true | false | date_add | date | varchar(x), bigint, date | scalar | true | add the specified amount of date to the given date | false | true | false | date_add | time | varchar(x), bigint, time | scalar | true | add the specified amount of time to the given time | false | true | false | date_add | time with time zone | varchar(x), bigint, time with time zone | scalar | true | add the specified amount of time to the given time | false | true | false | date_add | timestamp | varchar(x), bigint, timestamp | scalar | true | add the specified amount of time to the given timestamp | false | true | false | date_add | timestamp with time zone | varchar(x), bigint, timestamp with time zone | scalar | true | add the specified amount of time to the given timestamp | false | true | false | date_diff | bigint | varchar(x), date, date | scalar | true | difference of the given dates in the given unit | false | true | false | date_diff | bigint | varchar(x), time with time zone, time with time zone | scalar | true | difference of the given times in the given unit | false | true | false | date_diff | bigint | varchar(x), time, time | scalar | true | difference of the given times in the given unit | false | true | false | date_diff | bigint | varchar(x), timestamp with time zone, timestamp with time zone | scalar | true | difference of the given times in the given unit | false | true | false | date_diff | bigint | varchar(x), timestamp, timestamp | scalar | true | difference of the given times in the given unit | false | true | false | date_format | varchar | timestamp with time zone, varchar(x) | scalar | true | | false | true | false | date_format | varchar | timestamp, varchar(x) | scalar | true | | false | true | false | date_parse | timestamp | varchar(x), varchar(y) | scalar | true | | false | true | false | date_trunc | date | varchar(x), date | scalar | true | truncate to the specified precision in the session timezone | false | true | false | date_trunc | time | varchar(x), time | scalar | true | truncate to the specified precision in the session timezone | false | true | false | date_trunc | time with time zone | varchar(x), time with time zone | scalar | true | truncate to the specified precision | false | true | false | date_trunc | timestamp | varchar(x), timestamp | scalar | true | truncate to the specified precision in the session timezone | false | true | false | date_trunc | timestamp with time zone | varchar(x), timestamp with time zone | scalar | true | truncate to the specified precision | false | true | false | (21 rows) Switch to a different schema. use sf1; Display the Tables in the schema. show tables; Table ---------- customer lineitem nation orders part partsupp region supplier (8 rows) Query data from customer table. 
select * from customer limit 5; custkey | name | address | nationkey | phone | acctbal | mktsegment | comment ---------+--------------------+------------------------------------------+-----------+-----------------+---------+------------+------------------------------------------------------------------------------------------------------- 37501 | Customer#000037501 | Ftb6T5ImHuJ | 2 | 12-397-688-6719 | -324.85 | HOUSEHOLD | pending ideas use carefully. express, ironic platelets use among the furiously regular instructions. 37502 | Customer#000037502 | ppCVXCFV,4JJ97IibbcMB5,aPByjYL07vmOLO 3m | 18 | 28-515-931-4624 | 5179.2 | BUILDING | express deposits. pending, regular deposits wake furiously bold deposits. regular 37503 | Customer#000037503 | Cg60cN3LGIUpLpXn0vRffQl8 | 13 | 23-977-571-7365 | 1862.32 | BUILDING | ular deposits. furiously ironic deposits integrate carefully among the iron 37504 | Customer#000037504 | E1 IiMlCfW7I4 1b9wfDZR | 21 | 31-460-590-3623 | 2955.33 | HOUSEHOLD | s believe slyly final foxes. furiously e 37505 | Customer#000037505 | Ad,XVdA6XAa0h aukZHUo5Mxh,ZRwVR3k7b7 | 3 | 13-521-760-7263 | 3243.15 | FURNITURE | ites according to the quickly bold instru (5 rows) Gather statistics on a given table. show stats for customer; column_name | data_size | distinct_values_count | nulls_fraction | row_count | low_value | high_value -------------+-------------+-----------------------+----------------+-----------+-----------+------------ custkey | NULL | 150039.0 | 0.0 | NULL | 1 | 150000 name | 2700000.0 | 149980.0 | 0.0 | NULL | NULL | NULL address | 3758056.0 | 150043.0 | 0.0 | NULL | NULL | NULL nationkey | NULL | 25.0 | 0.0 | NULL | 0 | 24 phone | 2250000.0 | 150018.0 | 0.0 | NULL | NULL | NULL acctbal | NULL | 140166.0 | 0.0 | NULL | -999.99 | 9999.99 mktsegment | 1349610.0 | 5.0 | 0.0 | NULL | NULL | NULL comment | 1.0876099E7 | 149987.0 | 0.0 | NULL | NULL | NULL NULL | NULL | NULL | NULL | 150000.0 | NULL | NULL (9 rows) Quit Presto. quit;","title":"Presto CLI"},{"location":"wxd-prestocli/#watsonxdata-introduction","text":"Watsonx.data is based on open source PrestoDB, a distributed query engine that enables querying data stored in open file formats using open table formats for optimization or performance. Some of the characteristics which you will learn and see in action include: Compute processing is performed in memory and in parallel. Data is pipelined between query stages and over the network reducing latency overhead that one would have if disk I/O were involved. All the below tasks will be done using the Developer edition of watsonx.data.","title":"Watsonx.data Introduction"},{"location":"wxd-prestocli/#using-watsonxdata","text":"Connectivity to watsonx.data can be done using the following methods: Command line interface (CLI) JDBC drivers watsonx.data UI","title":"Using watsonx.data"},{"location":"wxd-prestocli/#connecting-to-watsonxdata-and-executing-queries-using-cli","text":"Open the watsonx.data CLI using the development directory. Make sure you are the root user. whoami If not, switch to the root user. sudo su - Change to the development directory. cd /root/ibm-lh-dev/bin Start the Presto CLI. ./presto-cli We are going to inspect the available catalogs in the watsonx.data system. A watsonx.data catalog contains schemas and references a data source via a connector. A connector is like a driver for a database. Watsonx.data connectors are an implementation of Presto\u2019s SPI which allows Presto to interact with a resource. 
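To make this concrete, a Presto catalog is ultimately defined by a small properties file that names the connector it uses. As a minimal sketch (the etc/catalog location and file name follow standard PrestoDB conventions and may differ inside the watsonx.data containers), a file named etc/catalog/tpch.properties containing the single line connector.name=tpch is enough to expose the TPCH data source as the tpch catalog.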
There are several built-in connectors for JMX, Hive, TPCH etc., some of which you will use as part of the labs. Display the catalogs. show catalogs; Catalog --------------- hive_data iceberg_data jmx system tpcds tpch (6 rows) Let's look up what schemas are available with any given catalog. We will use the TPCH catalog which is an internal PrestoDB auto-generated catalog and look at the available schemas. show schemas in tpch; Schema -------------------- information_schema sf1 sf100 sf1000 sf10000 sf100000 sf300 sf3000 sf30000 tiny (10 rows) Quit the presto-cli interface by executing the \"quit;\" command. quit; You can connect to a specific catalog and schema and look at the tables etc. ./presto-cli --catalog tpch --schema tiny presto:tiny> You will notice that the Presto prompt includes the name of the schema we are currently connected to. Look at the available tables in the TPCH catalog under the tiny schema. show tables; Table ---------- customer lineitem nation orders part partsupp region supplier (8 rows) Inspect schema of the customer table. describe customer; Column | Type | Extra | Comment ------------+--------------+-------+--------- custkey | bigint | | name | varchar(25) | | address | varchar(40) | | nationkey | bigint | | phone | varchar(15) | | acctbal | double | | mktsegment | varchar(10) | | comment | varchar(117) | | (8 rows) You could also use the syntax below to achieve the same result. show columns from customer; Column | Type | Extra | Comment -----------+--------------+-------+--------- custkey | bigint | | name | varchar(25) | | address | varchar(40) | | nationkey | bigint | | phone | varchar(15) | | acctbal | double | | mktsegment | varchar(10) | | comment | varchar(117) | | (8 rows) Inspect available functions. show functions like 'date%'; Function | Return Type | Argument Types | Function Type | Deterministic | Description | Variable Arity | Built In | Temporary | Language -------------+--------------------------+----------------------------------------------------------------+---------------+---------------+-------------------------------------------------------------+----------------+----------+-----------+---------- date | date | timestamp | scalar | true | | false | true | false | date | date | timestamp with time zone | scalar | true | | false | true | false | date | date | varchar(x) | scalar | true | | false | true | false | date_add | date | varchar(x), bigint, date | scalar | true | add the specified amount of date to the given date | false | true | false | date_add | time | varchar(x), bigint, time | scalar | true | add the specified amount of time to the given time | false | true | false | date_add | time with time zone | varchar(x), bigint, time with time zone | scalar | true | add the specified amount of time to the given time | false | true | false | date_add | timestamp | varchar(x), bigint, timestamp | scalar | true | add the specified amount of time to the given timestamp | false | true | false | date_add | timestamp with time zone | varchar(x), bigint, timestamp with time zone | scalar | true | add the specified amount of time to the given timestamp | false | true | false | date_diff | bigint | varchar(x), date, date | scalar | true | difference of the given dates in the given unit | false | true | false | date_diff | bigint | varchar(x), time with time zone, time with time zone | scalar | true | difference of the given times in the given unit | false | true | false | date_diff | bigint | varchar(x), time, time | scalar | true | difference of the 
given times in the given unit | false | true | false | date_diff | bigint | varchar(x), timestamp with time zone, timestamp with time zone | scalar | true | difference of the given times in the given unit | false | true | false | date_diff | bigint | varchar(x), timestamp, timestamp | scalar | true | difference of the given times in the given unit | false | true | false | date_format | varchar | timestamp with time zone, varchar(x) | scalar | true | | false | true | false | date_format | varchar | timestamp, varchar(x) | scalar | true | | false | true | false | date_parse | timestamp | varchar(x), varchar(y) | scalar | true | | false | true | false | date_trunc | date | varchar(x), date | scalar | true | truncate to the specified precision in the session timezone | false | true | false | date_trunc | time | varchar(x), time | scalar | true | truncate to the specified precision in the session timezone | false | true | false | date_trunc | time with time zone | varchar(x), time with time zone | scalar | true | truncate to the specified precision | false | true | false | date_trunc | timestamp | varchar(x), timestamp | scalar | true | truncate to the specified precision in the session timezone | false | true | false | date_trunc | timestamp with time zone | varchar(x), timestamp with time zone | scalar | true | truncate to the specified precision | false | true | false | (21 rows) Switch to a different schema. use sf1; Display the Tables in the schema. show tables; Table ---------- customer lineitem nation orders part partsupp region supplier (8 rows) Query data from customer table. select * from customer limit 5; custkey | name | address | nationkey | phone | acctbal | mktsegment | comment ---------+--------------------+------------------------------------------+-----------+-----------------+---------+------------+------------------------------------------------------------------------------------------------------- 37501 | Customer#000037501 | Ftb6T5ImHuJ | 2 | 12-397-688-6719 | -324.85 | HOUSEHOLD | pending ideas use carefully. express, ironic platelets use among the furiously regular instructions. 37502 | Customer#000037502 | ppCVXCFV,4JJ97IibbcMB5,aPByjYL07vmOLO 3m | 18 | 28-515-931-4624 | 5179.2 | BUILDING | express deposits. pending, regular deposits wake furiously bold deposits. regular 37503 | Customer#000037503 | Cg60cN3LGIUpLpXn0vRffQl8 | 13 | 23-977-571-7365 | 1862.32 | BUILDING | ular deposits. furiously ironic deposits integrate carefully among the iron 37504 | Customer#000037504 | E1 IiMlCfW7I4 1b9wfDZR | 21 | 31-460-590-3623 | 2955.33 | HOUSEHOLD | s believe slyly final foxes. furiously e 37505 | Customer#000037505 | Ad,XVdA6XAa0h aukZHUo5Mxh,ZRwVR3k7b7 | 3 | 13-521-760-7263 | 3243.15 | FURNITURE | ites according to the quickly bold instru (5 rows) Gather statistics on a given table. 
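If statistics appear empty for a table you created yourself, note that some connectors can populate them with an ANALYZE statement, for example: ANALYZE customer; (connector support for ANALYZE varies, so treat this as an optional extra). The show stats command below then reports whatever statistics are available.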
show stats for customer; column_name | data_size | distinct_values_count | nulls_fraction | row_count | low_value | high_value -------------+-------------+-----------------------+----------------+-----------+-----------+------------ custkey | NULL | 150039.0 | 0.0 | NULL | 1 | 150000 name | 2700000.0 | 149980.0 | 0.0 | NULL | NULL | NULL address | 3758056.0 | 150043.0 | 0.0 | NULL | NULL | NULL nationkey | NULL | 25.0 | 0.0 | NULL | 0 | 24 phone | 2250000.0 | 150018.0 | 0.0 | NULL | NULL | NULL acctbal | NULL | 140166.0 | 0.0 | NULL | -999.99 | 9999.99 mktsegment | 1349610.0 | 5.0 | 0.0 | NULL | NULL | NULL comment | 1.0876099E7 | 149987.0 | 0.0 | NULL | NULL | NULL NULL | NULL | NULL | NULL | 150000.0 | NULL | NULL (9 rows) Quit Presto. quit;","title":"Connecting to watsonx.data and executing queries using CLI"},{"location":"wxd-quick/","text":"Quick Start The following sections describe how to get started quickly with the watsonx.data developer system. If you are not familiar with the tools mentioned below, select the details link for more instructions. Requesting an IBM userid Requesting a TechZone image Accessing the Image SSH Access Open Ports Passwords Portainer Console Documentation IBM Userid An IBMid is needed to access IBM Technology Zone. If you do not have an IBMid, click on the following link and request a new IBMid. https://techzone.ibm.com More details: Creating an IBM Userid Requesting a TechZone image Log into TechZone ( https://techzone.ibm.com ) and search for the watsonx.data Developer Base Image or use the following link. https://techzone.ibm.com/collection/ibm-watsonxdata-developer-base-image Problem with reservations failing? Check the TechZone status page at https://techzone.status.io . More details: Reserving a TechZone image Accessing the Image The email from TechZone indicating that the image is ready will contain a link to your reservations. Click on the link and search for the watsonx.data reservation. More details: Accessing a TechZone image SSH Access Your TechZone reservation will include the server name and port number to use when connecting using ssh. The port number is referred to as port in the command below, while the server will be referred to as region.techzone-server.com . Replace these values with those found in your reservation. Open a terminal window and use the following syntax to connect as the watsonx userid. ssh -p port watsonx@region.techzone-server.com The port number and server name are provided as part of the TechZone reservation details. To become the root user, issue the following command. sudo su - Password for both users is watsonx.data . You can copy files into and out of the server using the following syntax: scp -P port myfile.txt watsonx@region.techzone-server.com:/tmp/myfile.txt scp -P port watsonx@region.techzone-server.com:/tmp/myfile.txt myfile.txt More details: SSH Access Open Ports The following URLs and Ports are used to access the watsonx.data services. Most browsers will work with these URLs. However, Mac OSX users should be aware that accessing the MinIO console may occasionally cause Firefox and Chrome to lock up. If you find that this occurs, try using Safari which appears to work fine. The ports that are used in the lab are listed below. Note that the internal port number is always the same when running in the VMware image using the VM Remote Console. When using your workstation's browser, you will need to use the server name and port number supplied in the TechZone reservation. 
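As an aside (not required for this lab), if a port is not directly reachable from your network, you can usually forward it over the SSH connection from your reservation, for example: ssh -p port -L 9443:localhost:9443 watsonx@region.techzone-server.com With the tunnel active, https://localhost:9443 reaches the watsonx.data management console from your workstation.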
Service Port Active watsonx.data management console 9443 Yes Presto console 8443 Yes MinIO console (S3 buckets) 9001 Yes MinIO S3 Endpoint 9000 Yes Portainer (Docker container management) 6443 Yes Apache Superset (Query and Graphing) 8088 No Jupyter Notebook 8888 Yes Presto External Port 8443 Yes Hive metadata Port 9043 Yes MySQL External Port 3306 Yes Postgres External Port 5432 Yes Db2 Database Port 50000 Yes VNC Port 5901 No Note : The following ports are not active unless the service is started: Apache Superset (8088) VNC Terminal Display (5901) More details: Open Ports Passwords This table lists the passwords for the services that have \"fixed\" userids and passwords. Service Userid Password Virtual Machine watsonx watsonx.data Virtual Machine root watsonx.data watsonx.data UI ibmlhadmin password Jupyter Notebook none watsonx.data Presto ibmlhadmin password MinIO Generated Generated Postgres admin Generated Apache Superset admin admin Portainer admin watsonx.data Db2 db2inst1 db2inst1 MySQL root password VNC Windows none watsonx. VNC OSX none watsonx.data Use the following commands to get the generated userid and password for MinIO. export LH_S3_ACCESS_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_ACCESS_KEY | sed \"s/.*=//\") export LH_S3_SECRET_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_SECRET_KEY | sed 's/.*=//') echo \"MinIO Userid : \" $LH_S3_ACCESS_KEY echo \"MinIO Password: \" $LH_S3_SECRET_KEY Use the following command to get the password for Postgres. export POSTGRES_PASSWORD=$(docker exec ibm-lh-postgres printenv | grep POSTGRES_PASSWORD | sed 's/.*=//') echo \"Postgres Userid : admin\" echo \"Postgres Password : \" $POSTGRES_PASSWORD You can get all passwords for the system when you are logged in by issuing the following command: cat /certs/passwords If the passwords do not appear to work, you may need to regenerate them. The following must be run as the root user. sudo su - passwords The passwords command will refresh the passwords and also display them. If this command is not run as root, an error message will be displayed because the password file cannot be updated as the watsonx user. More details: Passwords Portainer This lab system has Portainer installed. Portainer provides an administrative interface to the Docker containers that are running on this system. You can use this console to check that all the containers are running and see what resources they are using. Open your TechZone reservation and select the Portainer link to connect to it. Credentials: userid: admin password: watsonx.data More details: Portainer Documentation The following links provide more information on the components in this lab. watsonx.data - https://www.ibm.com/docs/en/watsonxdata/1.0.x Presto SQL - https://prestodb.io/docs/current/sql.html Presto Console - https://prestodb.io/docs/current/admin/web-interface.html MinIO - https://min.io/docs/minio/linux/administration/minio-console.html Apache Superset - https://superset.apache.org/docs/creating-charts-dashboards/exploring-data dBeaver - https://dbeaver.com/docs/wiki/Application-Window-Overview/ Db2 SQL - https://www.ibm.com/docs/en/db2/11.5?topic=queries-select-statement PostgreSQL SQL - https://www.postgresql.org/docs/current/sql.html","title":"Quick Start"},{"location":"wxd-quick/#quick-start","text":"The following sections describe how to get started quickly with the watsonx.data developer system. If you are not familiar with the tools mentioned below, select the details link for more instructions.
Requesting an IBM userid Requesting a TechZone image Accessing the Image SSH Access Open Ports Passwords Portainer Console Documentation","title":"Quick Start"},{"location":"wxd-quick/#ibm-userid","text":"An IBMid is needed to access IBM Technology Zone. If you do not have an IBMid, click on the following link and request a new IBMid. https://techzone.ibm.com More details: Creating an IBM Userid","title":"IBM Userid"},{"location":"wxd-quick/#requesting-a-techzone-image","text":"Log into TechZone ( https://techzone.ibm.com ) and search for the watsonx.data Developer Base Image or use the following link. https://techzone.ibm.com/collection/ibm-watsonxdata-developer-base-image Problem with reservations failing? Check the TechZone status page at https://techzone.status.io . More details: Reserving a TechZone image","title":"Requesting a TechZone image"},{"location":"wxd-quick/#accessing-the-image","text":"The email from TechZone indicating that the image is ready will contain a link to your reservations. Click on the link and search for the watsonx.data reservation. More details: Accessing a TechZone image","title":"Accessing the Image"},{"location":"wxd-quick/#ssh-access","text":"Your TechZone reservation will include the server name and port number to use when connecting using ssh. The port number is referred to as port in the command below, while the server will be referred to as region.techzone-server.com . Replace these values with those found in your reservation. Open a terminal window and use the following syntax to connect as the watsonx userid. ssh -p port watsonx@region.techzone-server.com The port number and server name are provided as part of the TechZone reservation details. To become the root user, issue the following command. sudo su - Password for both users is watsonx.data . You can copy files into and out of the server using the following syntax: scp -P port myfile.txt watsonx@region.techzone-server.com:/tmp/myfile.txt scp -P port watsonx@region.techzone-server.com:/tmp/myfile.txt myfile.txt More details: SSH Access","title":"SSH Access"},{"location":"wxd-quick/#open-ports","text":"The following URLs and Ports are used to access the watsonx.data services. Most browsers will work with these URLs. However, Mac OSX users should be aware that accessing the MinIO console may occasionally cause Firefox and Chrome to lock up. If you find that this occurs, try using Safari which appears to work fine. The ports that are used in the lab are listed below. Note that the internal port number is always the same when running in the VMware image using the VM Remote Console. When using your workstation's browser, you will need to use the server name and port number supplied in the TechZone reservation. Service Port Active watsonx.data management console 9443 Yes Presto console 8443 Yes MinIO console (S3 buckets) 9001 Yes MinIO S3 Endpoint 9000 Yes Portainer (Docker container management) 6443 Yes Apache Superset (Query and Graphing) 8088 No Jupyter Notebook 8888 Yes Presto External Port 8443 Yes Hive metadata Port 9043 Yes MySQL External Port 3306 Yes Postgres External Port 5432 Yes Db2 Database Port 50000 Yes VNC Port 5901 No Note : The following ports are not active unless the service is started: Apache Superset (8088) VNC Terminal Display (5901) More details: Open Ports","title":"Open Ports"},{"location":"wxd-quick/#passwords","text":"This table lists the passwords for the services that have \"fixed\" userids and passwords. 
Service Userid Password Virtual Machine watsonx watsonx.data Virtual Machine root watsonx.data watsonx.data UI ibmlhadmin password Jupyter Notebook none watsonx.data Presto ibmlhadmin password Minio Generated Generated Postgres admin Generated Apache Superset admin admin Portainer admin watsonx.data Db2 db2inst1 db2inst1 MySQL root password VNC Windows none watsonx. VNC OSX none watsonx.data Use the following commands to get the generated userid and password for MinIO. export LH_S3_ACCESS_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_ACCESS_KEY | sed 's/.*=//') export LH_S3_SECRET_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_SECRET_KEY | sed 's/.*=//') echo \"MinIO Userid : \" $LH_S3_ACCESS_KEY echo \"MinIO Password: \" $LH_S3_SECRET_KEY Use the following command to get the password for Postgres. export POSTGRES_PASSWORD=$(docker exec ibm-lh-postgres printenv | grep POSTGRES_PASSWORD | sed 's/.*=//') echo \"Postgres Userid : admin\" echo \"Postgres Password : \" $POSTGRES_PASSWORD You can get all passwords for the system when you are logged in by issuing the following command: cat /certs/passwords If the passwords do not appear to work, you may need to regenerate them. The following must be run as the root user. sudo su - passwords The passwords command will refresh the passwords and also display them. If this command is not run as root, an error message will be displayed because the password file cannot be updated as the watsonx user. More details: Passwords","title":"Passwords"},{"location":"wxd-quick/#portainer","text":"This lab system has Portainer installed. Portainer provides an administrative interface to the Docker images that are running on this system. You can use this console to check that all the containers are running and see what resources they are using. Open your TechZone reservation and select the Portainer link to connect to it. Credentials: userid: admin password: watsonx.data More details: Portainer","title":"Portainer"},{"location":"wxd-quick/#documentation","text":"The following links provide more information on the components in this lab. watsonx.data - https://www.ibm.com/docs/en/watsonxdata/1.0.x Presto SQL - https://prestodb.io/docs/current/sql.html Presto Console - https://prestodb.io/docs/current/admin/web-interface.html MinIO - https://min.io/docs/minio/linux/administration/minio-console.html Apache Superset - https://superset.apache.org/docs/creating-charts-dashboards/exploring-data dBeaver - https://dbeaver.com/docs/wiki/Application-Window-Overview/ Db2 SQL - https://www.ibm.com/docs/en/db2/11.5?topic=queries-select-statement PostgreSQL SQL - https://www.postgresql.org/docs/current/sql.html","title":"Documentation"},{"location":"wxd-reference-access/","text":"Accessing the watsonx.data TechZone Image The reservation email from TechZone is extremely important since it provides a link to your reservation. Click on View My Reservations to access your reservations. Click on the reservation that corresponds to the watsonx.data reservation. The menu button that is beside the arrow provides options to extend or delete the reservation. When you click on the reservation details option, or the reservation box, the browser will display the details of your image. Scroll down to the bottom of the web page to access the VM Remote Console. You can access the logon screen of the virtual machine by pressing the VM Remote Console button. It is not necessary to use the VM console unless you want to use the dBeaver program.
Select the watsonx user and use watsonx.data as the password. Refer to the section on VM Remote Console for more details.","title":"Accessing the reservation"},{"location":"wxd-reference-access/#accessing-the-watsonxdata-techzone-image","text":"The reservation email from TechZone is extremely important since it provides a link to your reservation. Click on View My Reservations to access your reservations. Click on the reservation that corresponds to the watsonx.data reservation. The menu button that is beside the arrow provides options to extend or delete the reservation. When you click on the reservation details option, or the reservation box, the browser will display the details of your image. Scroll down to the bottom of the web page to access the VM Remote Console. You can access the logon screen of the virtual machine by pressing the VM Remote Console button. It is not necessary to use the VM console unless you want to use the dBeaver program. Select the watsonx user and use watsonx.data as the password. Refer to the section on VM Remote Console for more details.","title":"Accessing the watsonx.data TechZone Image"},{"location":"wxd-reference-console/","text":"Using the VM Remote Console The watsonx server that has been provisioned has no physical monitor attached to it (commonly referred to as headless), so we need to use a different technique to view the desktop of the main user of the system (watsonx). The first thing to consider is whether you need to use the VM Remote Console at all. All the services, like the watsonx.data UI, MinIO, Presto, Apache Superset, and Portainer, are web-based, which means you just need to use your own browser to access these programs. Connecting into the watsonx virtual machine can be done using the secure shell command (ssh) which provides access to all the low-level commands you might need to use, like starting the Apache Superset service. Note that Apache Superset is not up and running by default, so you will need to start it before attempting to connect to it. So what's the VM Remote Console required for? One program that has been provided to view the database schemas is dBeaver, a community edition of software that provides a query interface to hundreds of data sources, including the watsonx.data environment. You can only use this program through the VM Remote Console. You do have the option of installing this software on your own machine if you wish. Find your email message that contains details of your reservation. Instructions for finding the reservation and the page containing its details can be found in the Accessing the reservation or Accessing a workshop section. Once the details appear, scroll down to the bottom of the web page, and you will see the VM Remote Console button. You can access the logon screen of the virtual machine by pressing the VM Remote Console button. Clicking on this button will display the logon screen for the server. Select the watsonx user and use watsonx.data as the password. You can open this window in a separate browser window, or place it into full-screen mode. Note that you may need to increase the size of your browser window (or change the scaling in the browser) to see the entire virtual desktop. At this point you have access to the desktop of the watsonx user and can issue commands from within this environment.
As mentioned previously, you do not need to use this interface to use the lab.","title":"VM Remote Console"},{"location":"wxd-reference-console/#using-the-vm-remote-console","text":"The watsonx server that has been provisioned has no physical monitor attached to it (commonly referred to as headless), so we need to use a different technique to view the desktop of the main user of the system (watsonx). The first thing to consider is whether you need to use the VM Remote Console at all. All the services, like the watsonx.data UI, MinIO, Presto, Apache Superset, and Portainer, are web-based, which means you just need to use your own browser to access these programs. Connecting into the watsonx virtual machine can be done using the secure shell command (ssh) which provides access to all the low-level commands you might need to use, like starting the Apache Superset service. Note that Apache Superset is not up and running by default, so you will need to start it before attempting to connect to it. So what's the VM Remote Console required for? One program that has been provided to view the database schemas is dBeaver, a community edition of software that provides a query interface to hundreds of data sources, including the watsonx.data environment. You can only use this program through the VM Remote Console. You do have the option of installing this software on your own machine if you wish. Find your email message that contains details of your reservation. Instructions for finding the reservation and the page containing its details can be found in the Accessing the reservation or Accessing a workshop section. Once the details appear, scroll down to the bottom of the web page, and you will see the VM Remote Console button. You can access the logon screen of the virtual machine by pressing the VM Remote Console button. Clicking on this button will display the logon screen for the server. Select the watsonx user and use watsonx.data as the password. You can open this window in a separate browser window, or place it into full-screen mode. Note that you may need to increase the size of your browser window (or change the scaling in the browser) to see the entire virtual desktop. At this point you have access to the desktop of the watsonx user and can issue commands from within this environment. As mentioned previously, you do not need to use this interface to use the lab.","title":"Using the VM Remote Console"},{"location":"wxd-reference-documentation/","text":"Documentation The following links provide more information on the components in this lab. watsonx.data - https://www.ibm.com/docs/en/watsonxdata/1.0.x Presto SQL - https://prestodb.io/docs/current/sql.html Presto Console - https://prestodb.io/docs/current/admin/web-interface.html MinIO - https://min.io/docs/minio/linux/administration/minio-console.html MinIO CLI - https://min.io/docs/minio/linux/reference/minio-mc.html Apache Superset - https://superset.apache.org/docs/creating-charts-dashboards/exploring-data dBeaver - https://dbeaver.com/docs/wiki/Application-Window-Overview/ Db2 SQL - https://www.ibm.com/docs/en/db2/11.5?topic=queries-select-statement PostgreSQL SQL - https://www.postgresql.org/docs/current/sql.html MySQL SQL - https://dev.mysql.com/doc/refman/8.1/en/sql-statements.html","title":"Documentation"},{"location":"wxd-reference-documentation/#documentation","text":"The following links provide more information on the components in this lab.
watsonx.data - https://www.ibm.com/docs/en/watsonxdata/1.0.x Presto SQL - https://prestodb.io/docs/current/sql.html Presto Console - https://prestodb.io/docs/current/admin/web-interface.html MinIO - https://min.io/docs/minio/linux/administration/minio-console.html MinIO CLI - https://min.io/docs/minio/linux/reference/minio-mc.html Apache Superset - https://superset.apache.org/docs/creating-charts-dashboards/exploring-data dBeaver - https://dbeaver.com/docs/wiki/Application-Window-Overview/ Db2 SQL - https://www.ibm.com/docs/en/db2/11.5?topic=queries-select-statement PostgreSQL SQL - https://www.postgresql.org/docs/current/sql.html MySQL SQL - https://dev.mysql.com/doc/refman/8.1/en/sql-statements.html","title":"Documentation"},{"location":"wxd-reference-ibmid/","text":"Requesting an IBM Userid. An IBMid is needed to access IBM Technology Zone. If you do not have an IBMid, click on the following link. https://techzone.ibm.com You should see the following login screen for TechZone. Click on the `Create an IBMid` button and proceed to fill in the details on this form: Once you have verified your account, you can continue on to log into the TechZone server.","title":"Requesting an IBMid"},{"location":"wxd-reference-ibmid/#requesting-an-ibm-userid","text":"An IBMid is needed to access IBM Technology Zone. If you do not have an IBMid, click on the following link. https://techzone.ibm.com You should see the following login screen for TechZone. Click on the `Create an IBMid` button and proceed to fill in the details on this form: Once you have verified your account, you can continue on to log into the TechZone server.","title":"Requesting an IBM Userid."},{"location":"wxd-reference-passwords/","text":"Passwords This table lists the passwords for the services that have \"fixed\" userids and passwords. Service Userid Password Virtual Machine watsonx watsonx.data Virtual Machine root watsonx.data watsonx.data UI ibmlhadmin password Jupyter Notebook none watsonx.data Presto ibmlhadmin password Minio Generated Generated Postgres admin Generated Apache Superset admin admin Portainer admin watsonx.data Db2 db2inst1 db2inst1 MySQL root password VNC Windows none watsonx. VNC OSX none watsonx.data Use the following commands to get the generated userid and password for MinIO. export LH_S3_ACCESS_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_ACCESS_KEY | sed 's/.*=//') export LH_S3_SECRET_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_SECRET_KEY | sed 's/.*=//') echo \"MinIO Userid : \" $LH_S3_ACCESS_KEY echo \"MinIO Password: \" $LH_S3_SECRET_KEY Use the following command to get the password for Postgres. export POSTGRES_PASSWORD=$(docker exec ibm-lh-postgres printenv | grep POSTGRES_PASSWORD | sed 's/.*=//') echo \"Postgres Userid : admin\" echo \"Postgres Password : \" $POSTGRES_PASSWORD You can get all passwords for the system when you are logged in by issuing the following command: cat /certs/passwords If the passwords do not appear to work, you may need to regenerate them. The following must be run as the root user. sudo su - passwords The passwords command will refresh the passwords and also display them. If this command is not run as root, an error message will be displayed because the password file cannot be updated as the watsonx user.","title":"Userids and Passwords"},{"location":"wxd-reference-passwords/#passwords","text":"This table lists the passwords for the services that have \"fixed\" userids and passwords.
Service Userid Password Virtual Machine watsonx watsonx.data Virtual Machine root watsonx.data watsonx.data UI ibmlhadmin password Jupyter Notebook none watsonx.data Presto ibmlhadmin password Minio Generated Generated Postgres admin Generated Apache Superset admin admin Portainer admin watsonx.data Db2 db2inst1 db2inst1 MySQL root password VNC Windows none watsonx. VNC OSX none watsonx.data Use the following commands to get the generated userid and password for MinIO. export LH_S3_ACCESS_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_ACCESS_KEY | sed 's/.*=//') export LH_S3_SECRET_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_SECRET_KEY | sed 's/.*=//') echo \"MinIO Userid : \" $LH_S3_ACCESS_KEY echo \"MinIO Password: \" $LH_S3_SECRET_KEY Use the following command to get the password for Postgres. export POSTGRES_PASSWORD=$(docker exec ibm-lh-postgres printenv | grep POSTGRES_PASSWORD | sed 's/.*=//') echo \"Postgres Userid : admin\" echo \"Postgres Password : \" $POSTGRES_PASSWORD You can get all passwords for the system when you are logged in by issuing the following command: cat /certs/passwords If the passwords do not appear to work, you may need to regenerate them. The following must be run as the root user. sudo su - passwords The passwords command will refresh the passwords and also display them. If this command is not run as root, an error message will be displayed because the password file cannot be updated as the watsonx user.","title":"Passwords"},{"location":"wxd-reference-portainer/","text":"Portainer This lab system has Portainer installed. Portainer provides an administrative interface to the Docker images that are running on this system. You can use this console to check that all the containers are running and see what resources they are using. Your TechZone reservation will include the server name and port number to use when connecting to Portainer. Open your browser and navigate to: Portainer console - https://region.techzone-server.com:port Credentials: userid: admin password: watsonx.data Once you have logged in, you should select \"Get Started\". The next screen displays the main control panel for Portainer. Select the Local server. This screen provides details on the containers, images, volumes, and networks that make up your docker installation. To view the containers that are running, select the container icon.
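If you prefer the command line, the same container list can be produced from a terminal session on the server (a minimal sketch using the standard Docker CLI; run it as the root user): docker ps --format 'table {{.Names}}\t{{.Status}}' The Portainer container view presents the same information graphically.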
From within this view, you can view the details of any container, including the environment settings and the current logs, and you can shell into the environment. For more details on Portainer, see the Portainer documentation .","title":"Portainer Console"},{"location":"wxd-reference-portainer/#portainer","text":"This lab system has Portainer installed. Portainer provides an administrative interface to the Docker images that are running on this system. You can use this console to check that all the containers are running and see what resources they are using. Your TechZone reservation will include the server name and port number to use when connecting to Portainer. Open your browser and navigate to: Portainer console - https://region.techzone-server.com:port Credentials: userid: admin password: watsonx.data Once you have logged in, you should select \"Get Started\". The next screen displays the main control panel for Portainer. Select the Local server. This screen provides details on the containers, images, volumes, and networks that make up your docker installation. To view the containers that are running, select the container icon. From within this view, you can view the details of any container, including the environment settings and the current logs, and you can shell into the environment. For more details on Portainer, see the Portainer documentation .","title":"Portainer"},{"location":"wxd-reference-ports/","text":"Watsonx.data Ports Your TechZone reservation will contain a list of published services at the top of the details page. These URLs and Port numbers are needed to access the watsonx.data services. The list will contain the following information: SSH for watsonx userid - ssh -p 20200 watsonx@region.techzone-services.com MySQL Port - Server: region.techzone-services.com Port: 21409 PostgreSQL Port - Server: region.techzone-services.com Port: 38052 VNC Service - vnc://region.techzone-services.com:38725 Portainer console - https://region.techzone-services.com:44449 Apache Superset - http://region.techzone-services.com:41471 Presto console - https://region.techzone-services.com:49618 Presto Port - Server: region.techzone-services.com Port: 49618 Jupyter Notebook - Server: http://region.techzone-services.com:25490/notebooks/Table_of_Contents.ipynb Minio Endpoint - Server: region.techzone-services.com Port: 29652 Minio console - http://region.techzone-services.com:45050 Hive Thrift URL - thrift://region.techzone-services.com:22211 Watsonx UI - https://region.techzone-services.com:37997 Db2 Port - Server: region.techzone-services.com Port: 21361 There are two additional ports which are available for use with any service you install in the system. These ports are: Open Port 1 - Server: region.techzone-services.com: Port: 45779 Open Port 2 - Server: region.techzone-services.com: Port: 43151 The server URL will be different for each region and data center that your machine is provisioned on. The server name is usually in the format: region.techzone-services.com:port The port number that is provided in the reservation is mapped to the proper port number in the server. For instance, the Db2 Port number is 50000 in the server, but the reservation above shows a port number of 21361. Use the following rules for determining what server name and port number to use when connecting to the databases: If you are using a program \"inside\" the watsonx server, the host is watsonxdata or localhost . Some systems will require the watsonx service name (ibm-lh-postgres) and these are highlighted in the documentation. The port number will always be the native port (i.e. 5432 for PostgreSQL). If you are using a program \"outside\" the watsonx server, the host is the one provided in your reservation region.techzone-services.com and the port will be the one that is included as part of the URL (i.e. 49618 in the example above). Watsonx.data Open Ports The following URLs and Ports are used to access the watsonx.data services. Most browsers will work with these URLs. Note : Mac OSX users should be aware that accessing the MinIO console may occasionally cause Firefox and Chrome to lock up. If you find that this occurs, try using Safari, which appears to work fine. The ports that are used in the lab are listed below, including their availability when you first access the lab. The internal port number is always the same when running in the VMware image using the VM Remote Console. When using your workstation's browser, you will need to use the server name and port number supplied in the TechZone reservation.
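The same rule applies to database clients. For example, connecting to the PostgreSQL database with the psql client and the documented admin userid (a sketch; the external port 38052 is taken from the sample reservation list above and will be different in your reservation): psql -h localhost -p 5432 -U admin from inside the virtual machine, or psql -h region.techzone-services.com -p 38052 -U admin from your own workstation.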
Service Port Active watsonx.data management console 9443 Yes Presto console 8443 Yes MinIO console (S3 buckets) 9001 Yes MinIO S3 Endpoint 9000 Yes Portainer (Docker container management) 6443 Yes Apache Superset (Query and Graphing) 8088 No Jupyter Notebook 8888 Yes Presto External Port 8443 Yes Hive metadata Port 9043 Yes MySQL External Port 3306 Yes Postgres External Port 5432 Yes Db2 Database Port 50000 Yes VNC Port 5901 No Note : The following ports are not active unless the service is started: Apache Superset (8088) VNC Terminal Display (5901) There are three https links that will result in a Certificate error in Firefox: watsonx.data UI Presto UI Portainer UI Follow these steps to ignore the error when accessing these URLs. Select Advanced. Choose \"Accept the Risk and Continue\". If you are using Google Chrome, you can bypass the error message by typing in \"thisisunsafe\" or clicking on the \"Proceed to 192.168.252.2 (unsafe)\" link.","title":"Available Ports"},{"location":"wxd-reference-ports/#watsonxdata-ports","text":"Your TechZone reservation will contain a list of published services at the top of the details page. These URLs and Port numbers are needed to access the watsonx.data services. The list will contain the following information: SSH for watsonx userid - ssh -p 20200 watsonx@region.techzone-services.com MySQL Port - Server: region.techzone-services.com Port: 21409 PostgreSQL Port - Server: region.techzone-services.com Port: 38052 VNC Service - vnc://region.techzone-services.com:38725 Portainer console - https://region.techzone-services.com:44449 Apache Superset - http://region.techzone-services.com:41471 Presto console - https://region.techzone-services.com:49618 Presto Port - Server: region.techzone-services.com Port: 49618 Jupyter Notebook - Server: http://region.techzone-services.com:25490/notebooks/Table_of_Contents.ipynb Minio Endpoint - Server: region.techzone-services.com Port: 29652 Minio console - http://region.techzone-services.com:45050 Hive Thrift URL - thrift://region.techzone-services.com:22211 Watsonx UI - https://region.techzone-services.com:37997 Db2 Port - Server: region.techzone-services.com Port: 21361 There are two additional ports which are available for use with any service you install in the system. These ports are: Open Port 1 - Server: region.techzone-services.com: Port: 45779 Open Port 2 - Server: region.techzone-services.com: Port: 43151 The server URL will be different for each region and data center that your machine is provisioned on. The server name is usually in the format: region.techzone-services.com:port The port number that is provided in the reservation is mapped to the proper port number in the server. For instance, the Db2 Port number is 50000 in the server, but the reservation above shows a port number of 21361. Use the following rules for determining what server name and port number to use when connecting to the databases: If you are using a program \"inside\" the watsonx server, the host is watsonxdata or localhost . Some systems will require the watsonx service name (ibm-lh-postgres) and these are highlighted in the documentation. The port number will always be the native port (i.e. 5432 for PostgreSQL). If you are using a program \"outside\" the watsonx server, the host is the one provided in your reservation region.techzone-services.com and the port will be the one that is included as part of the URL (i.e. 49618 in the example above).","title":"Watsonx.data Ports"},{"location":"wxd-reference-ports/#watsonxdata-open-ports","text":"The following URLs and Ports are used to access the watsonx.data services. Most browsers will work with these URLs. Note : Mac OSX users should be aware that accessing the MinIO console may occasionally cause Firefox and Chrome to lock up. If you find that this occurs, try using Safari, which appears to work fine. The ports that are used in the lab are listed below, including their availability when you first access the lab. The internal port number is always the same when running in the VMware image using the VM Remote Console. When using your workstation's browser, you will need to use the server name and port number supplied in the TechZone reservation. Service Port Active watsonx.data management console 9443 Yes Presto console 8443 Yes MinIO console (S3 buckets) 9001 Yes MinIO S3 Endpoint 9000 Yes Portainer (Docker container management) 6443 Yes Apache Superset (Query and Graphing) 8088 No Jupyter Notebook 8888 Yes Presto External Port 8443 Yes Hive metadata Port 9043 Yes MySQL External Port 3306 Yes Postgres External Port 5432 Yes Db2 Database Port 50000 Yes VNC Port 5901 No Note : The following ports are not active unless the service is started: Apache Superset (8088) VNC Terminal Display (5901) There are three https links that will result in a Certificate error in Firefox: watsonx.data UI Presto UI Portainer UI Follow these steps to ignore the error when accessing these URLs. Select Advanced. Choose \"Accept the Risk and Continue\". If you are using Google Chrome, you can bypass the error message by typing in \"thisisunsafe\" or clicking on the \"Proceed to 192.168.252.2 (unsafe)\" link.","title":"Watsonx.data Open Ports"},{"location":"wxd-reference-ssh/","text":"SSH Access All the commands in the lab will require you to execute commands in a terminal window. Access to a terminal window can be accomplished in the three ways described below: Use the SSH Command Locally Use Jupyter notebook terminal Terminal window in the VM If you are unable to use ssh because of connection restrictions, the Jupyter notebook option is the easiest one to use. SSH Command Your TechZone reservation will include the server name and port number to use when connecting using ssh. The port number is referred to as port in the command below, while the server will be referred to as region.techzone-server.com . Replace these values with those found in your reservation. You have the choice of using the VM Remote console and logging in as the watsonx user to issue commands, or using a local terminal shell (iTerm, Hyper, terminal) to run commands against the watsonx.data server. You can have multiple connections into the machine at any one time. It will be easier to cut-and-paste commands into a local terminal shell. The VM Remote Console does not support cut-and-paste operation from outside the virtual console environment. Open a terminal window and use the following syntax to connect as the watsonx userid. ssh -p port watsonx@region.techzone-server.com The port number and server name are provided as part of the TechZone reservation details. To become the root user, issue the following command. sudo su - Password for both users is watsonx.data . Jupyter Notebook Terminal The Jupyter Notebook lab environment provided as part of the lab also provides a way of issuing terminal commands. To access this environment, you must find the Jupyter notebook URL provided in the lab reservation.
When you initially open the link, it will request a password to view the Table of Contents: The default password for the notebook is watsonx.data . Once you enter the password, the Table of Contents will be displayed. Select the File menu at the top of the screen and then select New - Terminal. A terminal window will be displayed in the browser. You are now using a terminal session inside the watsonx.data server. By default, you are already the root user, so there is no need to run a sudo su - command. Note that you are not in the correct directory to run commands. You must issue the following command to be in the command directory. cd /root/ibm-lh-dev/bin All the commands in the lab can now be run from within this browser rather than using the VM Remote console. If at any time you accidentally close this window, you can open another one using the Jupyter notebook File menu. Terminal Window in the VM If you use the Remote VM Console , you can log in as the watsonx user and use a Terminal shell to run commands against the watsonx.data server. Select the Terminal application in the virtual machine to issue commands. This will open up the terminal window. At this point you are connected as the watsonx user. You can ignore any lab instructions that ask you to ssh into the watsonx server. To become the root user, you must enter the following command in the terminal window. sudo su - Now as the root user you will be ready to run the commands found in the lab. Copying Files If you need to move files into or out of the virtual machine, you can use the following commands. To copy a file into the virtual machine use the following syntax: scp -P port myfile.txt watsonx@region.techzone-server.com:/tmp/myfile.txt The filename myfile.txt will be copied to the /tmp directory. The temporary directory is useful since you can copy the file to multiple places from within the Linux environment. Multiple files can be moved by using wildcard characters using the following syntax: scp -P port myfile.* watsonx@region.techzone-server.com:/tmp Moving files from the image back to your local system requires that you reverse the file specification. scp -P port watsonx@region.techzone-server.com:/tmp/myfile.txt /Downloads/myfile.txt You can also use wildcards to select more than one file.","title":"SSH and SCP Commands"},{"location":"wxd-reference-ssh/#ssh-access","text":"All the commands in the lab will require you to execute commands in a terminal window. Access to a terminal window can be accomplished in the three ways described below: Use the SSH Command Locally Use Jupyter notebook terminal Terminal window in the VM If you are unable to use ssh because of connection restrictions, the Jupyter notebook option is the easiest one to use.","title":"SSH Access"},{"location":"wxd-reference-ssh/#ssh-command","text":"Your TechZone reservation will include the server name and port number to use when connecting using ssh. The port number is referred to as port in the command below, while the server will be referred to as region.techzone-server.com . Replace these values with those found in your reservation. You have the choice of using the VM Remote console and logging in as the watsonx user to issue commands, or using a local terminal shell (iTerm, Hyper, terminal) to run commands against the watsonx.data server. You can have multiple connections into the machine at any one time. It will be easier to cut-and-paste commands into a local terminal shell.
The VM Remote Console does not support cut-and-paste operation from outside the virtual console environment. Open a terminal window and use the following syntax to connect as the watsonx userid. ssh -p port watsonx@region.techzone-server.com The port number and server name are provided as part of the TechZone reservation details. To become the root user, issue the following command. sudo su - Password for both users is watsonx.data .","title":"SSH Command"},{"location":"wxd-reference-ssh/#jupyter-notebook-terminal","text":"The Jupyter Notebook lab environment provided as part of the lab also provides a way of issuing terminal commands. To access this environment, you must find the Jupyter notebook URL provided in the lab reservation. When you initially open the link, it will request a password to view the Table of Contents: The default password for the notebook is watsonx.data . Once you enter the password, the Table of Contents will be displayed. Select the File menu at the top of the screen and then select New - Terminal. A terminal window will be displayed in the browser. You are now using a terminal session inside the watsonx.data server. By default, you are already the root user, so there is no need to run a sudo su - command. Note that you are not in the correct directory to run commands. You must issue the following command to be in the command directory. cd /root/ibm-lh-dev/bin All the commands in the lab can now be run from within this browser rather than using the VM Remote console. If at any time you accidentally close this window, you can open another one using the Jupyter notebook File menu.","title":"Jupyter Notebook Terminal"},{"location":"wxd-reference-ssh/#terminal-window-in-the-vm","text":"If you use the Remote VM Console , you can log in as the watsonx user and use a Terminal shell to run commands against the watsonx.data server. Select the Terminal application in the virtual machine to issue commands. This will open up the terminal window. At this point you are connected as the watsonx user. You can ignore any lab instructions that ask you to ssh into the watsonx server. To become the root user, you must enter the following command in the terminal window. sudo su - Now as the root user you will be ready to run the commands found in the lab.","title":"Terminal Window in the VM"},{"location":"wxd-reference-ssh/#copying-files","text":"If you need to move files into or out of the virtual machine, you can use the following commands. To copy a file into the virtual machine use the following syntax: scp -P port myfile.txt watsonx@region.techzone-server.com:/tmp/myfile.txt The filename myfile.txt will be copied to the /tmp directory. The temporary directory is useful since you can copy the file to multiple places from within the Linux environment. Multiple files can be moved by using wildcard characters using the following syntax: scp -P port myfile.* watsonx@region.techzone-server.com:/tmp Moving files from the image back to your local system requires that you reverse the file specification. scp -P port watsonx@region.techzone-server.com:/tmp/myfile.txt /Downloads/myfile.txt You can also use wildcards to select more than one file.","title":"Copying Files"},{"location":"wxd-reference-techzone/","text":"Requesting a TechZone image If you are part of a workshop, you do not have to request a reservation. Instead, go to the Accessing a Workshop section. Log into TechZone ( https://techzone.ibm.com ) and search for the watsonx.data Developer Base Image or use the following link.
https://techzone.ibm.com/collection/ibm-watsonxdata-developer-base-image If you have not logged into the IBM Cloud site, you will be asked to authenticate with your IBM userid. If you do not have an IBM userid, you will need to register for one. This lab is open to IBMers and Business Partners. Once you have logged in, you should see the following. Select the Environment tab on the far-left side. Note : There may be more than one environment available. Choose the one best suited for your requirements. Press the Reserve button. Select \"reserve now\" (why wait?). For \"Purpose\" select Self Education. This will expand to request additional information. Fill in the purpose field with something meaningful (watsonx.data education). Next select preferred Geography for the image. Choose any of the regions that are closest to your location. Note : The TechZone scheduler will pick a location in your region that has capacity to deploy your image. Previously you needed to pick a physical location (DAL10, WDC04, TOK02, etc...). The number of locations has expanded to 4 North American, 4 European and 2 AP locations which will hopefully provide more capacity to deploy the lab. If you find that your reservation is not being provisioned, check the status of the TechZone environment by referring to the TechZone status page at https://techzone.status.io . Next select the end date for the lab. Make sure you select enough time for you to use the lab! It defaults to 2 days, but you can extend the reservation! You do not need to enable VPN Access . Once you have completed the form, check the box indicating that you agree to the terms and conditions of using TechZone, and click SUBMIT on the bottom right-hand corner. At this point you will need to wait patiently for an email that acknowledges that your request has been placed into Provisioning mode. Eventually you will receive an email confirming that the system is ready to be used. Note that this can take a number of hours depending on the load on the TechZone servers. You may also get a message telling you that the system provisioning has Failed. Ignore the reason field since it is usually related to an environment failure caused by lack of resources. Check the status of TechZone first ( https://techzone.status.io ). If the systems appear to be okay, try requesting another image or using a different server location if possible. Contact TechZone support if you are having difficulties provisioning a system.","title":"Requesting an image"},{"location":"wxd-reference-techzone/#requesting-a-techzone-image","text":"If you are part of a workshop, you do not have to request a reservation. Instead, go to the Accessing a Workshop section. Log into TechZone ( https://techzone.ibm.com ) and search for the watsonx.data Developer Base Image or use the following link. https://techzone.ibm.com/collection/ibm-watsonxdata-developer-base-image If you have not logged into the IBM Cloud site, you will be asked to authenticate with your IBM userid. If you do not have an IBM userid, you will need to register for one. This lab is open to IBMers and Business Partners. Once you have logged in, you should see the following. Select the Environment tab on the far-left side. Note : There may be more than one environment available. Choose the one best suited for your requirements. Press the Reserve button. Select \"reserve now\" (why wait?). For \"Purpose\" select Self Education. This will expand to request additional information. 
Fill in the purpose field with something meaningful (watsonx.data education). Next, select the preferred Geography for the image. Choose any of the regions that are closest to your location. Note : The TechZone scheduler will pick a location in your region that has capacity to deploy your image. Previously you needed to pick a physical location (DAL10, WDC04, TOK02, etc...). The number of locations has expanded to 4 North American, 4 European and 2 AP locations, which will hopefully provide more capacity to deploy the lab. If you find that your reservation is not being provisioned, check the status of the TechZone environment by referring to the TechZone status page at https://techzone.status.io . Next, select the end date for the lab. Make sure you select enough time for you to use the lab! It defaults to 2 days, but you can extend the reservation! You do not need to enable VPN Access . Once you have completed the form, check the box indicating that you agree to the terms and conditions of using TechZone, and click SUBMIT on the bottom right-hand corner. At this point you will need to wait patiently for an email that acknowledges that your request has been placed into Provisioning mode. Eventually you will receive an email confirming that the system is ready to be used. Note that this can take a number of hours depending on the load on the TechZone servers. You may also get a message telling you that the system provisioning has Failed. Ignore the reason field since it is usually related to an environment failure caused by lack of resources. Check the status of TechZone first ( https://techzone.status.io ). If the systems appear to be okay, try requesting another image or using a different server location if possible. Contact TechZone support if you are having difficulties provisioning a system.","title":"Requesting an image"},{"location":"wxd-reference-techzone/#requesting-a-techzone-image","text":"If you are part of a workshop, you do not have to request a reservation. Instead, go to the Accessing a Workshop section. Log into TechZone ( https://techzone.ibm.com ) and search for the watsonx.data Developer Base Image or use the following link. https://techzone.ibm.com/collection/ibm-watsonxdata-developer-base-image If you have not logged into the IBM Cloud site, you will be asked to authenticate with your IBM userid. If you do not have an IBM userid, you will need to register for one. This lab is open to IBMers and Business Partners. Once you have logged in, you should see the following. Select the Environment tab on the far-left side. Note : There may be more than one environment available. Choose the one best suited for your requirements. Press the Reserve button. Select \"reserve now\" (why wait?). For \"Purpose\" select Self Education. This will expand to request additional information. Fill in the purpose field with something meaningful (watsonx.data education). Next, select the preferred Geography for the image. Choose any of the regions that are closest to your location. Note : The TechZone scheduler will pick a location in your region that has capacity to deploy your image. Previously you needed to pick a physical location (DAL10, WDC04, TOK02, etc...). The number of locations has expanded to 4 North American, 4 European and 2 AP locations, which will hopefully provide more capacity to deploy the lab. If you find that your reservation is not being provisioned, check the status of the TechZone environment by referring to the TechZone status page at https://techzone.status.io . Next, select the end date for the lab. Make sure you select enough time for you to use the lab! It defaults to 2 days, but you can extend the reservation! You do not need to enable VPN Access . Once you have completed the form, check the box indicating that you agree to the terms and conditions of using TechZone, and click SUBMIT on the bottom right-hand corner. At this point you will need to wait patiently for an email that acknowledges that your request has been placed into Provisioning mode. Eventually you will receive an email confirming that the system is ready to be used. Note that this can take a number of hours depending on the load on the TechZone servers. You may also get a message telling you that the system provisioning has Failed. Ignore the reason field since it is usually related to an environment failure caused by lack of resources. Check the status of TechZone first ( https://techzone.status.io ). If the systems appear to be okay, try requesting another image or using a different server location if possible. Contact TechZone support if you are having difficulties provisioning a system.","title":"Requesting a TechZone image"},{"location":"wxd-reference-vnc/","text":"Using the VM Remote Console The watsonx server that has been provisioned has no physical monitor attached to it (commonly referred to as headless), so we need to use a different technique to view the desktop of the main user of the system (watsonx). The first thing to consider is whether you need to use the VM Remote Console at all. All the services, like the watsonx.data UI, MinIO, Presto, Apache Superset, and Portainer, are web-based, which means you just need to use your own browser to access these programs. Connecting into the watsonx virtual machine can be done using the secure shell command (ssh) which provides access to all the low-level commands you might need to use, like starting the Apache Superset service. Note that Apache Superset is not up and running by default, so you will need to start it before attempting to connect to it. So what's the VM Remote Console required for? One program that has been provided to view the database schemas is dBeaver, a community edition of software that provides a query interface to hundreds of data sources, including the watsonx.data environment. You can only use this program through the VM Remote Console. You do have the option of installing this software on your own machine if you wish. Find your email message that contains details of your reservation. Instructions for finding the reservation and the page containing its details can be found in the Accessing the reservation section. Once the details appear, scroll down to the bottom of the web page, and you will see the VM Remote Console button.
You can access the logon screen of the virtual machine by pressing the VM Remote Console button. Clicking on this button will display the logon screen for the server. Select the watsonx user and use watsonx.data as the password. You can open this window in a separate browser window, or place it into fullscreen mode. Note that you may need to increase the size of your browser window (or change the scaling in the browser) to see the entire virtual desktop. At this point you have access to the desktop of the watsonx user and can issue commands from within this environment. As mentioned previously, you do not need to use this interface to use the lab. Enabling VNC Access The watsonx.data image has port 5901 exposed for use with VNC browsers. If you want to use VNC instead of the VM Remote Console access, you must do the following: Make sure you are not currently logged in using the VM Virtual Console You have suitable VNC software (Mac OSX includes this). Use RealVNC or UltraVNC on a Windows box. You have a Terminal Shell open to issue root commands In a terminal window, ssh into the watsonx.data virtual machine as the watsonx user. Then you will need to become the root user and issue the following commands: sudo su - systemctl enable vncserver@:1 systemctl start vncserver@:1 systemctl daemon-reload After these commands complete, you will not be able to use the VM Remote Console to connect to the watsonx userid. Instead, you will need to use your VNC software to connect to the server. If at any time you want to turn off VNC support, issue the following commands: sudo su - systemctl disable vncserver@:1 systemctl stop vncserver@:1 systemctl daemon-reload Access watsonx.data on a Mac OSX system Once the VNC service has been started, you can connect to the machine by using the URL provided in your reservation document (sample URL below): VNC Service - vnc://region.techzone-server.com:28314 Use the Mac screen sharing app to connect to watsonx.data. You can connect using the OSX Safari browser by using the URL provided above. It will automatically start the screen sharing application. Note : The VNC URL format is only valid in Safari and may not work in other browsers. When the service connects to the server, it will prompt for the password of the watsonx user - watsonx.data . Once connected, you will see the console of the watsonx user. You may also want to consider making the screen size larger. Use the drop-down menu (Applications) at the top of the screen to select Other -> Settings. In the Devices section of the Settings menu, select Displays and choose a resolution that is suitable for your environment. Access watsonx.data on a Windows system Windows does not supply a native VNC browser. You will need to install a program like RealVNC or UltraVNC to access the console. Directions for installing UltraVNC are shown below. Note : This software has not been officially approved for use on Windows, although it has been tested against the watsonx.data server. UltraVNC UltraVNC is a VNC browser for the Windows environment. This is an open-source offering that can be found on Github in the UltraVNC project . The developers of this code have additional offerings that they sell for a fee and use advertising on their site to support their work on this and other projects. Because of the way the website has been designed, there are a number of ads and buttons that might distract you from the actual product you want to download. The official web page of UltraVNC is https://uvnc.com/ .
Instead of going through all the menus, use the following link to go directly to the download screen: UltraVNC Download Link The website will display an initial download page that will wait for 10 seconds before you can do anything. This gives you time to read the advertising or make a voluntary donation to their coding efforts. Once the wait time has expired, you will see the following screen: Make sure to check that you accept the above conditions and press the download button. You should see the download progress in your browser. Click \"Open File\" against the downloaded file. Once the installer starts, it will ask you to approve changes to the system. Press Yes and then select the language you want. Press OK. Select Accept the license agreement and press Next. The summary of what it is going to install is shown on this screen. You will then have to provide the installation location or use the default location for the code. The next panel asks which components you want to install. Only install the viewer. Then it requests the name of the Shortcut folder. Just keep the name it recommends. Press Next. The next panel asks whether you want a desktop shortcut. Probably a good idea if you are going to use it for a long period of time. Notice how they make this install take longer than expected to get you to donate? After all of that you will see the final installation screen. Click on Install and eventually you will get the completion notice. Pressing Finish will give you one final chance to donate. Make sure to unselect \"Show latest releases\" or else you will be directed back to their website. Using UltraVNC Start UltraVNC viewer by scrolling through your applications on your desktop. Choose the UltraVNC Viewer (not the listen mode ones unless you want to watch someone else using the desktop). When the service starts, it will ask for the server and port (Example below). VNC Service - vnc://region.techzone-server.com:28314 For the server you would enter region.techzone-server.com and the port would be 28314 . The examples below assume the IP address of 192.168.252.2 with a port number of 5901 . Before hitting enter, you may want to select the options button and change the setting in Miscellaneous to prevent advertising from being displayed while working with this program. Press connect when done. The password for the service is watsonx. , which is watsonx with a period ( . ) at the end. At this point you should see the desktop of the virtual machine. You are now connected and can work on the watsonx.data desktop. If you find that performance is sluggish, this may be due to network latency. In the settings toolbar of the UltraVNC window, change the color resolution from Full to 256 . Your screen may look a bit washed out, but this will reduce the amount of data that needs to be sent over the network to render your screen.","title":"Using the VM Remote Console"},{"location":"wxd-reference-vnc/#using-the-vm-remote-console","text":"The watsonx server that has been provisioned has no physical monitor attached to it (commonly referred to as headless), so we need to use a different technique to view the desktop of the main user of the system (watsonx). The first thing to consider is whether you need to use the VM Remote Console at all. All the services, like the watsonx.data UI, MinIO, Presto, Apache Superset, and Portainer, are web-based, which means you just need to use your own browser to access these programs.
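If one of the published web ports is blocked by your network, you can usually forward it over the ssh connection instead (a sketch with sample values; replace the ssh port and server with the ones in your reservation): ssh -p 20200 -L 9443:localhost:9443 watsonx@region.techzone-services.com With the tunnel in place, the watsonx.data console becomes available locally at https://localhost:9443 .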
Connecting into the watsonx virtual machine can be done using the secure shell command (ssh) which provides access to all the low-level commands you might need to use, like starting the Apache Superset service. Note that Apache Superset is not up and running by default, so you will need to start it before attempting to connect to it. So what's the VM Remote Console required for? One program that has been provided to view the database schemas is dBeaver, a community edition of software that provides a query interface to hundreds of data sources, including the watsonx.data environment. You can only use this program through the VM Remote Console. You do have the option of installing this software on your own machine if you wish. Find your email message that contains details of your reservation. Instructions for finding the reservation and the page containing its details can be found in the Accessing the reservation section. Once the details appear, scroll down to the bottom of the web page, and you will see the VM Remote Console button. You can access the logon screen of the virtual machine by pressing the VM Remote Console button. Clicking on this button will display the logon screen for the server. Select the watsonx user and use watsonx.data as the password. You can open this window in a separate browser window, or place it into fullscreen mode. Note that you may need to increase the size of your browser window (or change the scaling in the browser) to see the entire virtual desktop. At this point you have access to the desktop of the watsonx user and can issue commands from within this environment. As mentioned previously, you do not need to use this interface to use the lab.","title":"Using the VM Remote Console"},{"location":"wxd-reference-vnc/#enabling-vnc-access","text":"The watsonx.data image has port 5901 exposed for use with VNC browsers. If you want to use VNC instead of the VM Remote Console access, you must do the following: Make sure you are not currently logged in using the VM Virtual Console You have suitable VNC software (Mac OSX includes this). Use RealVNC or UltraVNC on a Windows box. You have a Terminal Shell open to issue root commands In a terminal window, ssh into the watsonx.data virtual machine as the watsonx user. Then you will need to become the root user and issue the following commands: sudo su - systemctl enable vncserver@:1 systemctl start vncserver@:1 systemctl daemon-reload After these commands complete, you will not be able to use the VM Remote Console to connect to the watsonx userid. Instead, you will need to use your VNC software to connect to the server. If at any time you want to turn off VNC support, issue the following commands: sudo su - systemctl disable vncserver@:1 systemctl stop vncserver@:1 systemctl daemon-reload","title":"Enabling VNC Access"},{"location":"wxd-reference-vnc/#access-watsonxdata-on-a-mac-osx-system","text":"Once the VNC service has been started, you can connect to the machine by using the URL provided in your reservation document (sample URL below): VNC Service - vnc://region.techzone-server.com:28314 Use the Mac screen sharing app to connect to watsonx.data. You can connect using the OSX Safari browser by using the URL provided above. It will automatically start the screen sharing application. Note : The VNC URL format is only valid in Safari and may not work in other browsers. When the service connects to the server, it will prompt for the password of the watsonx user - watsonx.data .
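If the connection is refused, confirm from an ssh session that the VNC service is actually running before retrying (standard systemd commands, matching the Enabling VNC Access steps above): sudo systemctl status vncserver@:1 The service should report active (running); if it does not, repeat the enable and start commands shown earlier.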
Once connected, you will see the console of the watsonx user. You may also want to consider making the screen size larger. Use the drop-down menu (Applications) at the top of the screen to select Other -> Settings. In the Devices section of the Settings menu, select Displays and choose a resolution that is suitable for your environment.","title":"Access watsonx.data on a Mac OSX system"},{"location":"wxd-reference-vnc/#access-watsonxdata-on-a-windows-system","text":"Windows does not supply a native VNC browser. You will need to install a program like RealVNC or UltraVNC to access the console. Directions for installing UltraVNC are shown below. Note : This software has not been officially approved for use on Windows, although it has been tested against the watsonx.data server.","title":"Access watsonx.data on a Windows system"},{"location":"wxd-reference-vnc/#ultravnc","text":"UltraVNC is a VNC browser for the Windows environment. This is an open-source offering that can be found on Github in the UltraVNC project . The developers of this code have additional offerings that they sell for a fee and use advertising on their site to support their work on this and other projects. Because of the way the website has been designed, there are a number of ads and buttons that might distract you from the actual product you want to download. The official web page of UltraVNC is https://uvnc.com/ . Instead of going through all the menus, use the following link to go directly to the download screen: UltraVNC Download Link The website will display an initial download page that will wait for 10 seconds before you can do anything. This gives you time to read the advertising or make a voluntary donation to their coding efforts. Once the wait time has expired, you will see the following screen: Make sure to check that you accept the above conditions and press the download button. You should see the download progress in your browser. Click \"Open File\" against the downloaded file. Once the installer starts, it will ask you to approve changes to the system. Press Yes and then select the language you want. Press OK. Select Accept the license agreement and press Next. The summary of what it is going to install is shown on this screen. You will then have to provide the installation location or use the default location for the code. The next panel asks which components you want to install. Only install the viewer. Then it requests the name of the Shortcut folder. Just keep the name it recommends. Press Next. The next panel asks whether you want a desktop shortcut. Probably a good idea if you are going to use it for a long period of time. Notice how they make this install take longer than expected to get you to donate? After all of that you will see the final installation screen. Click on Install and eventually you will get the completion notice. Pressing Finish will give you one final chance to donate. Make sure to unselect \"Show latest releases\" or else you will be directed back to their website.","title":"UltraVNC"},{"location":"wxd-reference-vnc/#using-ultravnc","text":"Start UltraVNC viewer by scrolling through your applications on your desktop. Choose the UltraVNC Viewer (not the listen mode ones unless you want to watch someone else using the desktop). When the service starts, it will ask for the server and port (Example below). VNC Service - vnc://region.techzone-server.com:28314 For the server you would enter region.techzone-server.com and the port would be 28314 .
The examples below assume the IP address of 192.168.252.2 with a port number of 5901 . Before hitting enter, you may want to select the options button and change the setting in Miscellaneous to prevent advertising from being displayed while working with this program. Press connect when done. The password for the service is watsonx. which is watsonx with a period . at the end. At this point you should see the desktop of the virtual machine. You are now connected and can work on the watsonx.data desktop. If you find that performance is sluggish, this may be due to network latency. In the settings toolbar of the UltraVNC window, change the color resolution from Full to 256 . Your screen may look a bit washed out, but this will reduce the amount of data that needs to be sent over the network to render your screen.","title":"Using UltraVNC"},{"location":"wxd-reference-workshop/","text":"Accessing a Workshop To access a watsonx.data workshop, you will need to have an IBM userid and a link provided by your class instructor. This link will first ask you to log into the system using your IBM userid, and then a screen similar to the following will be displayed: The title of the workshop may be different, but the steps to access the lab will remain the same. The class instructor will have provided a unique password for the course. Enter that value into the password/access code box and click on the Submit button. When the connection is successful, the details of your environment will be shown in the browser. The top of the page contains all the published services that you will use during the lab. For instance, if the lab requires that you access the Presto console, you would click on the link in this browser which says: Presto console - https://useast.techzone-services.com:xxxxx At the bottom of the reservation you will find a button that provides access to the machine console. The large blue button labelled VM Remote Console will provide access to the Linux operating system that the watsonx.data server is running on. See the section on VM Remote Console","title":"Accessing a workshop"},{"location":"wxd-reference-workshop/#accessing-a-workshop","text":"To access a watsonx.data workshop, you will need to have an IBM userid and a link provided by your class instructor. This link will first ask you to log into the system using your IBM userid, and then a screen similar to the following will be displayed: The title of the workshop may be different, but the steps to access the lab will remain the same. The class instructor will have provided a unique password for the course. Enter that value into the password/access code box and click on the Submit button. When the connection is successful, the details of your environment will be shown in the browser. The top of the page contains all the published services that you will use during the lab. For instance, if the lab requires that you access the Presto console, you would click on the link in this browser which says: Presto console - https://useast.techzone-services.com:xxxxx At the bottom of the reservation you will find a button that provides access to the machine console. The large blue button labelled VM Remote Console will provide access to the Linux operating system that the watsonx.data server is running on. 
See the section on VM Remote Console","title":"Accessing a Workshop"},{"location":"wxd-revisions/","text":"Revisions February 29, 2024 (1.1.2) SSL connection for data sources You can now enable SSL connections for the following data sources by using the Add database user interface to secure and encrypt the database connection: Db2 PostgreSQL IBM Data Virtualization Manager for z/OS For IBM Data Virtualization Manager for z/OS and PostgreSQL, select Validate certificate to validate whether the SSL certificate that is returned by the host is trusted. For the IBM Data Virtualization Manager for z/OS data source, you can choose to provide the hostname in the SSL certificate. Secure ingestion job history Now users can view only their own ingestion job history. Administrators can view the ingestion job history for all users. New data types BLOB and CLOB for SAP HANA and Teradata data sources New data types BLOB and CLOB are available for SAP HANA and Teradata data sources. You can use these data types only with SELECT statements in the Query workspace to build and run queries against your data. Use more SQL statements You can now use the following SQL statements in the Query workspace to build and run queries against your data: Apache Iceberg data sources: CREATE VIEW DROP VIEW MongoDB data sources: DELETE Create a new table during data ingestion Previously, you had to have a target table in watsonx.data for ingesting data. Now, you can create a new table directly from the source data file (available in parquet or CSV format) by using data ingestion through the watsonx.data user interface. You can create the table by using the following methods of ingestion: Ingesting data by using Iceberg copy loader Ingesting data by using Spark Perform ALTER TABLE operations on a column With an Iceberg data source, you can now perform ALTER TABLE operations on a column for the following data type conversions: int to bigint float to double decimal (num1, dec_digits) to decimal (num2, dec_digits), where num2>num1. Better query performance by using sorted files With an Iceberg data source, you can generate sorted files, which reduce the query result latency and improve the performance of Presto. Data in the Apache Iceberg table is sorted during the writing process within each file. You can configure the order to sort the data by using the sorted_by table property. When you create the table, specify the array of columns involved in sorting. Exposing Hive metastore port details (Developer edition) You can now expose the Hive metastore port details outside the watsonx.data developer edition's host to facilitate connection from external applications (services outside of docker or Podman), such as the integration of Db2 and Spark with watsonx.data. January 25, 2024 (1.1.1) Updated Lab Documentation Instructions for using a Workshop environment New section on user administration and creating policies Running terminal commands now uses the Jupyter notebook shell January 8, 2024 (1.1.1) Updated the lab to GA watsonx.data 1.1.1 code What's new in watsonx.data version 1.1.1 Reference Audit logging IBM watsonx.data now integrates with the Cloud Pak for Data audit logging service. Auditable events for watsonx.data are forwarded to the security information and event management (SIEM) solution that you integrate with. 
Use self-signed certificates and CA certificates to connect to object stores Previously, watsonx.data could connect to HTTPS endpoints that used certificates signed by well-known certificate authorities, such as IBM Cloud\u00ae Object Storage and Amazon S3. Now, you can connect to object stores that use self-signed certificates or certificates that are signed by other certificate authorities. Integration with Db2\u00ae and Netezza\u00ae You can now register Db2 or Netezza engines with a valid console URL. You can use the metastore URL shown on the Engine details page to sync the respective engines with the appropriate bucket catalog-based tables. IBM Data Virtualization Manager for z/OS\u00ae connector You can use the new IBM Data Virtualization Manager for z/OS\u00ae connector to read and write IBM Z\u00ae data without having to move, replicate, or transform the data. For more information, see Connecting to an IBM Data Virtualization Manager (DVM) data source. Better memory management Metastore caching and metadata caching (header and footer caching) are now enabled by default to optimize the memory usage. Also, now you can create a local staging directory to optimize the use of resources during data operations. For more information, see Enhancing the query performance through caching and Configuring a local staging directory. Presto case-sensitive behavior The Presto behavior is changed from case-insensitive to case-sensitive. Now you can provide the object names in their original case format as in the database. You can also create Schemas, Tables and Columns in mixed case, that is, uppercase and lowercase, through Presto if the database supports it. Teradata connector is enabled for multiple ALTER TABLE statements The Teradata connector now supports the ALTER TABLE RENAME TO, ALTER TABLE DROP COLUMN, ALTER TABLE RENAME COLUMN column_name TO new_column_name statements. Removal of development (*-devel) packages For security reasons, the *-devel packages are removed from watsonx.data. If you are already using the development packages, the programs that use the development packages cannot be compiled. For any queries, contact IBM Support. SSL is enabled for PostgreSQL Now ingestion can use mounted certificates when connecting to PostgreSQL. January 3, 2024 (1.1.0) Added two open ports to the image Sometimes there is a requirement to add another service to the watsonx.data image. For instance, you may want to add MongoDB or MSSQL to the system in order to demonstrate federation with these data sources. Since we do not know what your requirements are, we have opened up two ports that can be assigned to any service. The documentation has been updated to describe the steps needed to open up and use these ports. December 6, 2023 (1.1.0) Updated the lab to GA 1.1.0 code What's new in watsonx.data version 1.1.0 Reference Time-travel and roll-back queries You can now run the following time-travel queries to access historical data in Apache Iceberg tables: SELECT FROM FOR TIMESTAMP AS OF TIMESTAMP SELECT FROM FOR VERSION AS OF You can use time-travel queries to query and restore data that was updated or deleted in the past. You can also roll back an Apache Iceberg table to any existing snapshot. Capture historical data about Presto queries The Query History Monitoring and Management (QHMM) service captures historical data about Presto queries and events. The historical data is stored in a MinIO bucket and you can use the data to understand the queries that were run and to debug the Presto engine. 
Improved query performance with Metastore, File list, and File metadata caching. In addition, you can now capture and track the DDL changes in watsonx.data by using an event listener. Ingest data by using Spark You can now use the IBM Analytics Engine powered by Apache Spark to run ingestion jobs in watsonx.data. Integration with Db2 and Netezza Performance Server You can now register Db2 or Netezza Performance Server engines in the watsonx.data console. New connectors You can now use connectors in watsonx.data to establish connections to the following types of databases: Teradata Delta Lake Elasticsearch SAP HANA SingleStoreDB Snowflake Db2 Upgraded to 11.5.9 What's new in Db2 11.5.9 Reference October 6, 2023 (1.0.3) Updated the lab to GA 1.0.3 code What's new in watsonx.data version 1.0.3 Image now available in 10 data centers with simpler provisioning Requesting an Image Removed VPN Requirement External URLs and Ports for all UI Services watsonx.data Ports Added PostgreSQL and MySQL databases Postgres Connection MySQL Connection Added Jupyter notebook examples Jupyter Notebook support Fixed Presto certificate to support TechZone addresses without updating /etc/hosts Watsonx.data Connection Certificate Added standalone Spark server to show connectivity to the Presto database Accessing watsonx.data with Spark Added watsonx.data Client code Watsonx.data client utilities Added MinIO CLI interface MinIO CLI Exposed external ports for MinIO, Db2, MySQL, PostgreSQL, Hive, PrestoDB watsonx.data Ports VNC Interface disabled by default Enabling VNC Access Added Ingesting data chapter Ingesting Data July 25, 2023 (1.0.1) Updated the lab to GA 1.0.1 code Automated start of watsonx.data and simplification of many of the sections Removed the Ingest section until a new version is available Added Db2 and PostgreSQL connection details June 12, 2023 (1.0.0) Clarified some commands and added an Appendix on common issues. June 6, 2023 (1.0.0) Updated instructions for new TechZone image and added Ingest lab instructions. May 25th, 2023 (1.0.0) Initial publication.","title":"What's New"},{"location":"wxd-revisions/#revisions","text":"","title":"Revisions"},{"location":"wxd-revisions/#february-29-2024-112","text":"SSL connection for data sources You can now enable SSL connections for the following data sources by using the Add database user interface to secure and encrypt the database connection: Db2 PostgreSQL IBM Data Virtualization Manager for z/OS For IBM Data Virtualization Manager for z/OS and PostgreSQL, select Validate certificate to validate whether the SSL certificate that is returned by the host is trusted. For the IBM Data Virtualization Manager for z/OS data source, you can choose to provide the hostname in the SSL certificate. Secure ingestion job history Now users can view only their own ingestion job history. Administrators can view the ingestion job history for all users. New data types BLOB and CLOB for SAP HANA and Teradata data sources New data types BLOB and CLOB are available for SAP HANA and Teradata data sources. You can use these data types only with SELECT statements in the Query workspace to build and run queries against your data. Use more SQL statements You can now use the following SQL statements in the Query workspace to build and run queries against your data: Apache Iceberg data sources: CREATE VIEW DROP VIEW MongoDB data sources: DELETE Create a new table during data ingestion Previously, you had to have a target table in watsonx.data for ingesting data. 
Now, you can create a new table directly from the source data file (available in parquet or CSV format) by using data ingestion through the watsonx.data user interface. You can create the table by using the following methods of ingestion: Ingesting data by using Iceberg copy loader Ingesting data by using Spark Perform ALTER TABLE operations on a column With an Iceberg data source, you can now perform ALTER TABLE operations on a column for the following data type conversions: int to bigint float to double decimal (num1, dec_digits) to decimal (num2, dec_digits), where num2>num1. Better query performance by using sorted files With an Iceberg data source, you can generate sorted files, which reduce the query result latency and improve the performance of Presto. Data in the Apache Iceberg table is sorted during the writing process within each file. You can configure the order to sort the data by using the sorted_by table property. When you create the table, specify the array of columns involved in sorting. Exposing Hive metastore port details (Developer edition) You can now expose the Hive metastore port details outside the watsonx.data developer edition's host to facilitate connection from external applications (services outside of docker or Podman), such as the integration of Db2 and Spark with watsonx.data.","title":"February 29, 2024 (1.1.2)"},{"location":"wxd-revisions/#january-25-2024-111","text":"Updated Lab Documentation Instructions for using a Workshop environment New section on user administration and creating policies Running terminal commands now uses the Jupyter notebook shell","title":"January 25, 2024 (1.1.1)"},{"location":"wxd-revisions/#january-8-2024-111","text":"Updated the lab to GA watsonx.data 1.1.1 code What's new in watsonx.data version 1.1.1 Reference Audit logging IBM watsonx.data now integrates with the Cloud Pak for Data audit logging service. Auditable events for watsonx.data are forwarded to the security information and event management (SIEM) solution that you integrate with. Use self-signed certificates and CA certificates to connect to object stores Previously, watsonx.data could connect to HTTPS endpoints that used certificates signed by well-known certificate authorities, such as IBM Cloud\u00ae Object Storage and Amazon S3. Now, you can connect to object stores that use self-signed certificates or certificates that are signed by other certificate authorities. Integration with Db2\u00ae and Netezza\u00ae You can now register Db2 or Netezza engines with a valid console URL. You can use the metastore URL shown on the Engine details page to sync the respective engines with the appropriate bucket catalog-based tables. IBM Data Virtualization Manager for z/OS\u00ae connector You can use the new IBM Data Virtualization Manager for z/OS\u00ae connector to read and write IBM Z\u00ae data without having to move, replicate, or transform the data. For more information, see Connecting to an IBM Data Virtualization Manager (DVM) data source. Better memory management Metastore caching and metadata caching (header and footer caching) are now enabled by default to optimize the memory usage. Also, now you can create a local staging directory to optimize the use of resources during data operations. For more information, see Enhancing the query performance through caching and Configuring a local staging directory. Presto case-sensitive behavior The Presto behavior is changed from case-insensitive to case-sensitive. 
Now you can provide the object names in their original case format as in the database. You can also create Schemas, Tables and Columns in mixed case, that is, uppercase and lowercase, through Presto if the database supports it. Teradata connector is enabled for multiple ALTER TABLE statements The Teradata connector now supports the ALTER TABLE RENAME TO, ALTER TABLE DROP COLUMN, ALTER TABLE RENAME COLUMN column_name TO new_column_name statements. Removal of development (*-devel) packages For security reasons, the *-devel packages are removed from watsonx.data. If you are already using the development packages, the programs that use the development packages cannot be compiled. For any queries, contact IBM Support. SSL is enabled for PostgreSQL Now ingestion can use mounted certificates when connecting to PostgreSQL.","title":"January 8, 2024 (1.1.1)"},{"location":"wxd-revisions/#january-3-2024-110","text":"Added two open ports to the image Sometimes there is a requirement to add another service to the watsonx.data image. For instance, you may want to add MongoDB or MSSQL to the system in order to demonstrate federation with these data sources. Since we do not know what your requirements are, we have opened up two ports that can be assigned to any service. The documentation has been updated to describe the steps needed to open up and use these ports.","title":"January 3, 2024 (1.1.0)"},{"location":"wxd-revisions/#december-6-2023-110","text":"Updated the lab to GA 1.1.0 code What's new in watsonx.data version 1.1.0 Reference Time-travel and roll-back queries You can now run the following time-travel queries to access historical data in Apache Iceberg tables: SELECT FROM FOR TIMESTAMP AS OF TIMESTAMP SELECT FROM FOR VERSION AS OF You can use time-travel queries to query and restore data that was updated or deleted in the past. You can also roll back an Apache Iceberg table to any existing snapshot. Capture historical data about Presto queries The Query History Monitoring and Management (QHMM) service captures historical data about Presto queries and events. The historical data is stored in a MinIO bucket and you can use the data to understand the queries that were run and to debug the Presto engine. Improved query performance with Metastore, File list, and File metadata caching. In addition, you can now capture and track the DDL changes in watsonx.data by using an event listener. Ingest data by using Spark You can now use the IBM Analytics Engine powered by Apache Spark to run ingestion jobs in watsonx.data. Integration with Db2 and Netezza Performance Server You can now register Db2 or Netezza Performance Server engines in the watsonx.data console. 
New connectors You can now use connectors in watsonx.data to establish connections to the following types of databases: Teradata Delta Lake Elasticsearch SAP HANA SingleStoreDB Snowflake Db2 Upgraded to 11.5.9 What's new in Db2 11.5.9 Reference","title":"December 6, 2023 (1.1.0)"},{"location":"wxd-revisions/#october-6-2023-103","text":"Updated the lab to GA 1.0.3 code What's new in watsonx.data version 1.0.3 Image now available in 10 data centers with simpler provisioning Requesting an Image Removed VPN Requirement External URLs and Ports for all UI Services watsonx.data Ports Added PostgreSQL and MySQL databases Postgres Connection MySQL Connection Added Jupyter notebook examples Jupyter Notebook support Fixed Presto certificate to support TechZone addresses without updating /etc/hosts Watsonx.data Connection Certificate Added standalone Spark server to show connectivity to the Presto database Accessing watsonx.data with Spark Added watsonx.data Client code Watsonx.data client utilities Added MinIO CLI interface MinIO CLI Exposed external ports for MinIO, Db2, MySQL, PostgreSQL, Hive, PrestoDB watsonx.data Ports VNC Interface disabled by default Enabling VNC Access Added Ingesting data chapter Ingesting Data","title":"October 6, 2023 (1.0.3)"},{"location":"wxd-revisions/#july-25-2023-101","text":"Updated the lab to GA 1.0.1 code Automated start of watsonx.data and simplification of many of the sections Removed the Ingest section until a new version is available Added Db2 and PostgreSQL connection details","title":"July 25, 2023 (1.0.1)"},{"location":"wxd-revisions/#june-12-2023-100","text":"Clarified some commands and added an Appendix on common issues.","title":"June 12, 2023 (1.0.0)"},{"location":"wxd-revisions/#june-6-2023-100","text":"Updated instructions for new TechZone image and added Ingest lab instructions.","title":"June 6, 2023 (1.0.0)"},{"location":"wxd-revisions/#may-25th-2023-100","text":"Initial publication.","title":"May 25th, 2023 (1.0.0)"},{"location":"wxd-startwatsonx/","text":"Lab Instructions URL Conventions Your TechZone reservation contains a number of URLs for the services provided by the watsonx.data server. The URL will contain the name of the server and the corresponding port number for the service. Throughout the documentation, the server name will be referred to as region.techzone-server.com and the port number as port . Where you see these URLs, replace them with the values found in your reservation. Commands Throughout the labs, any command that needs to be executed will be highlighted in a grey box: cd /root/ibm-lh-dev/bin A copy icon is usually found on the far right-hand side of the command box. Use this to copy the text and paste it into your command window. You can also select the text and copy it that way. Note that some commands may span multiple lines, so make sure you copy everything in the box. System Check Your TechZone reservation will include the server name and port number to use when connecting using ssh. The port number is referred to as port in the command below, while the server will be referred to as region.techzone-server.com . Replace these values with those found in your reservation. Make sure that you have an open terminal session and use the following command to connect to the watsonx.data server. ssh -p port watsonx@region.techzone-server.com The password is watsonx.data . Next switch to the root userid. sudo su - Switch to the development code bin directory. 
cd /root/ibm-lh-dev/bin Once you have switched to the development directory, you can start running watsonx.data commands. You can check the status with the following command. ./status.sh --all Output will look similar to: using /root/ibm-lh-dev/localstorage/volumes as data root directory for user: root/1001 infra config location is /root/ibm-lh-dev/localstorage/volumes/infra lhconsole-ui running 0.0.0.0:9443->8443/tcp, :::9443->8443/tcp lhconsole-nodeclient-svc running 3001/tcp lhconsole-javaapi-svc running 8090/tcp lhconsole-api running 3333/tcp, 8081/tcp ibm-lh-presto running 0.0.0.0:8443->8443/tcp, :::8443->8443/tcp ibm-lh-hive-metastore running ibm-lh-postgres running 5432/tcp ibm-lh-minio running To confirm that the software is working, run the following commands to validate the installation. Presto Engine Test Check the Presto engine by connecting to a schema. First, we need to make sure that the Presto engine has completed all startup tasks. The following command is not part of watsonx.data, but has been included to simplify checking the status of the Presto service. check_presto Waiting for Presto to start. ........................... Ready Note : The startup process may take up to 5 minutes when the system first initializes. Once the command returns \"Ready\", you can connect to the Presto CLI. ./presto-cli --catalog tpch --schema tiny Retrieve a sample of rows from the customer table. Note : If the Presto engine has not yet started (you didn't run the check_presto script), the next command may result in a useless Java error message. You may need to wait a minute before attempting to run the statement again. select * from customer limit 10; All Presto commands end with a semi-colon. The result set should include a number of rows (the results will be random). custkey | name | address | nationkey | phone | acctbal | mktsegment | comment ---------+--------------------+---------------------------------------+-----------+-----------------+---------+------------+------------------------------------------------------------------------------------------------------------------- 1 | Customer#000000001 | IVhzIApeRb ot,c,E | 15 | 25-989-741-2988 | 711.56 | BUILDING | to the even, regular platelets. regular, ironic epitaphs nag e 2 | Customer#000000002 | XSTf4,NCwDVaWNe6tEgvwfmRchLXak | 13 | 23-768-687-3665 | 121.65 | AUTOMOBILE | l accounts. blithely ironic theodolites integrate boldly: caref 3 | Customer#000000003 | MG9kdTD2WBHm | 1 | 11-719-748-3364 | 7498.12 | AUTOMOBILE | deposits eat slyly ironic, even instructions. express foxes detect slyly. blithely even accounts abov 4 | Customer#000000004 | XxVSJsLAGtn | 4 | 14-128-190-5944 | 2866.83 | MACHINERY | requests. final, regular ideas sleep final accou 5 | Customer#000000005 | KvpyuHCplrB84WgAiGV6sYpZq7Tj | 3 | 13-750-942-6364 | 794.47 | HOUSEHOLD | n accounts will have to unwind. foxes cajole accor 6 | Customer#000000006 | sKZz0CsnMD7mp4Xd0YrBvx,LREYKUWAh yVn | 20 | 30-114-968-4951 | 7638.57 | AUTOMOBILE | tions. even deposits boost according to the slyly bold packages. final accounts cajole requests. furious 7 | Customer#000000007 | TcGe5gaZNgVePxU5kRrvXBfkasDTea | 18 | 28-190-982-9759 | 9561.95 | AUTOMOBILE | ainst the ironic, express theodolites. express, even pinto beans among the exp 8 | Customer#000000008 | I0B10bB0AymmC, 0PrRYBCP1yGJ8xcBPmWhl5 | 17 | 27-147-574-9335 | 6819.74 | BUILDING | among the slyly regular theodolites kindle blithely courts. 
carefully even theodolites haggle slyly along the ide 9 | Customer#000000009 | xKiAFTjUsCuxfeleNqefumTrjS | 8 | 18-338-906-3675 | 8324.07 | FURNITURE | r theodolites according to the requests wake thinly excuses: pending requests haggle furiousl 10 | Customer#000000010 | 6LrEaV6KR6PLVcgl2ArL Q3rqzLzcT1 v2 | 5 | 15-741-346-9870 | 2753.54 | HOUSEHOLD | es regular deposits haggle. fur (10 rows) The output on your screen will look similar to the following: The arrows on the far right side indicate that there is more output to view. Press the right and left arrows on your keyboard to scroll the display. If the result set is small, all of the results will display on the screen and no scrolling will be available unless the results are wider than the screen size. When the display shows (END) you have reached the bottom of the output. If the display shows a colon ( : ) at the bottom of the screen, you can use the up and down arrow keys to scroll a record at a time, or the Page Up and Page Down keys to scroll a page at a time. To quit viewing the output, press the Q key. Quit the Presto CLI. The Presto quit command can be used with or without a semicolon. quit; Congratulations, your system is now up and running!","title":"Lab Instructions"},{"location":"wxd-startwatsonx/#lab-instructions","text":"","title":"Lab Instructions"},{"location":"wxd-startwatsonx/#url-conventions","text":"Your TechZone reservation contains a number of URLs for the services provided by the watsonx.data server. The URL will contain the name of the server and the corresponding port number for the service. Throughout the documentation, the server name will be referred to as region.techzone-server.com and the port number as port . Where you see these URLs, replace them with the values found in your reservation.","title":"URL Conventions"},{"location":"wxd-startwatsonx/#commands","text":"Throughout the labs, any command that needs to be executed will be highlighted in a grey box: cd /root/ibm-lh-dev/bin A copy icon is usually found on the far right-hand side of the command box. Use this to copy the text and paste it into your command window. You can also select the text and copy it that way. Note that some commands may span multiple lines, so make sure you copy everything in the box.","title":"Commands"},{"location":"wxd-startwatsonx/#system-check","text":"Your TechZone reservation will include the server name and port number to use when connecting using ssh. The port number is referred to as port in the command below, while the server will be referred to as region.techzone-server.com . Replace these values with those found in your reservation. Make sure that you have an open terminal session and use the following command to connect to the watsonx.data server. ssh -p port watsonx@region.techzone-server.com The password is watsonx.data . Next switch to the root userid. sudo su - Switch to the development code bin directory. cd /root/ibm-lh-dev/bin Once you have switched to the development directory, you can start running watsonx.data commands. You can check the status with the following command. 
./status.sh --all Output will look similar to: using /root/ibm-lh-dev/localstorage/volumes as data root directory for user: root/1001 infra config location is /root/ibm-lh-dev/localstorage/volumes/infra lhconsole-ui running 0.0.0.0:9443->8443/tcp, :::9443->8443/tcp lhconsole-nodeclient-svc running 3001/tcp lhconsole-javaapi-svc running 8090/tcp lhconsole-api running 3333/tcp, 8081/tcp ibm-lh-presto running 0.0.0.0:8443->8443/tcp, :::8443->8443/tcp ibm-lh-hive-metastore running ibm-lh-postgres running 5432/tcp ibm-lh-minio running To confirm that the software is working, run the following commands to validate the installation.","title":"System Check"},{"location":"wxd-startwatsonx/#presto-engine-test","text":"Check the Presto engine by connecting to a schema. First, we need to make sure that the Presto engine has completed all startup tasks. The following command is not part of watsonx.data, but has been included to simplify checking the status of the Presto service. check_presto Waiting for Presto to start. ........................... Ready Note : The startup process may take up to 5 minutes when the system first initializes. Once the command returns \"Ready\", you can connect to the Presto CLI. ./presto-cli --catalog tpch --schema tiny Retrieve a sample of rows from the customer table. Note : If the Presto engine has not yet started (you didn't run the check_presto script), the next command may result in a useless Java error message. You may need to wait a minute before attempting to run the statement again. select * from customer limit 10; All Presto commands end with a semi-colon. The result set should include a number of rows (the results will be random). custkey | name | address | nationkey | phone | acctbal | mktsegment | comment ---------+--------------------+---------------------------------------+-----------+-----------------+---------+------------+------------------------------------------------------------------------------------------------------------------- 1 | Customer#000000001 | IVhzIApeRb ot,c,E | 15 | 25-989-741-2988 | 711.56 | BUILDING | to the even, regular platelets. regular, ironic epitaphs nag e 2 | Customer#000000002 | XSTf4,NCwDVaWNe6tEgvwfmRchLXak | 13 | 23-768-687-3665 | 121.65 | AUTOMOBILE | l accounts. blithely ironic theodolites integrate boldly: caref 3 | Customer#000000003 | MG9kdTD2WBHm | 1 | 11-719-748-3364 | 7498.12 | AUTOMOBILE | deposits eat slyly ironic, even instructions. express foxes detect slyly. blithely even accounts abov 4 | Customer#000000004 | XxVSJsLAGtn | 4 | 14-128-190-5944 | 2866.83 | MACHINERY | requests. final, regular ideas sleep final accou 5 | Customer#000000005 | KvpyuHCplrB84WgAiGV6sYpZq7Tj | 3 | 13-750-942-6364 | 794.47 | HOUSEHOLD | n accounts will have to unwind. foxes cajole accor 6 | Customer#000000006 | sKZz0CsnMD7mp4Xd0YrBvx,LREYKUWAh yVn | 20 | 30-114-968-4951 | 7638.57 | AUTOMOBILE | tions. even deposits boost according to the slyly bold packages. final accounts cajole requests. furious 7 | Customer#000000007 | TcGe5gaZNgVePxU5kRrvXBfkasDTea | 18 | 28-190-982-9759 | 9561.95 | AUTOMOBILE | ainst the ironic, express theodolites. express, even pinto beans among the exp 8 | Customer#000000008 | I0B10bB0AymmC, 0PrRYBCP1yGJ8xcBPmWhl5 | 17 | 27-147-574-9335 | 6819.74 | BUILDING | among the slyly regular theodolites kindle blithely courts. 
carefully even theodolites haggle slyly along the ide 9 | Customer#000000009 | xKiAFTjUsCuxfeleNqefumTrjS | 8 | 18-338-906-3675 | 8324.07 | FURNITURE | r theodolites according to the requests wake thinly excuses: pending requests haggle furiousl 10 | Customer#000000010 | 6LrEaV6KR6PLVcgl2ArL Q3rqzLzcT1 v2 | 5 | 15-741-346-9870 | 2753.54 | HOUSEHOLD | es regular deposits haggle. fur (10 rows) The output on your screen will look similar to the following: The arrows on the far right side indicate that there is more output to view. Press the right and left arrows on your keyboard to scroll the display. If the result set is small, all of the results will display on the screen and no scrolling will be available unless the results are wider than the screen size. When the display shows (END) you have reached the bottom of the output. If the display shows a colon ( : ) at the bottom of the screen, you can use the up and down arrow keys to scroll a record at a time, or the Page Up and Page Down keys to scroll a page at a time. To quit viewing the output, press the Q key. Quit the Presto CLI. The Presto quit command can be used with or without a semicolon. quit; Congratulations, your system is now up and running!","title":"Presto Engine Test"},{"location":"wxd-superset/","text":"Reporting/Dashboarding using Apache Superset Apache Superset is not a part of watsonx.data and is only used to demonstrate the capability to connect to watsonx.data from other BI/Reporting tools. You will need to install Apache Superset as part of this lab. The Superset repository needs to be in sync with the image being downloaded, so these libraries cannot be preloaded into this development image. Open a terminal window and connect via SSH as the watsonx user. Do not connect as the root user. Clone the Apache Superset repository with the git command. This command typically takes less than 1 minute to download the code. git clone https://github.com/apache/superset.git The docker-compose-non-dev.yml file needs to be updated so that Apache Superset can access the same network that watsonx.data is using. cd ./superset cp docker-compose-non-dev.yml docker-compose-non-dev-backup.yml sed '/version: \"3.7\"/q' docker-compose-non-dev.yml > yamlfix.txt cat <<EOF >> yamlfix.txt networks: default: external: True name: ibm-lh-network EOF sed -e '1,/version: \"3.7\"/ d' docker-compose-non-dev.yml >> yamlfix.txt We update the Apache Superset code to version 2.1.0 . sed 's/\\${TAG:-latest-dev}/2.1.0/' yamlfix.txt > docker-compose-non-dev.yml Use docker compose to start Apache Superset. docker compose -f docker-compose-non-dev.yml up The docker compose command will download the necessary code for Apache Superset and start the service. The terminal session will contain the logging information for the service. The process is running in the foreground so you will see all the messages being produced by the program. If you want to stop the service at any time you will need to press CTRL-C. If you close this terminal window at any time, the process will stop. When you see \"Init Step 4/4\", the service is ready for connections. If you have already installed Apache Superset and you stopped it, there is no need to reinstall the program. Go back to the /home/watsonx/superset directory and run the docker compose command again. Once the service is running, open your browser and navigate to the URL and port that were provided in your TechZone reservation. The credentials for Apache Superset are userid admin , password admin . 
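For reference, the bar chart built at the end of this chapter (average account balance by market segment) is equivalent to a simple aggregation query. The following is a minimal SQL sketch, assuming the customer table in the workshop schema of the iceberg_data catalog created earlier in the lab; you could run it from the Presto CLI, or from Superset once the database connection in the next section is in place: SELECT mktsegment, AVG(acctbal) AS avg_acctbal FROM iceberg_data.workshop.customer GROUP BY mktsegment ORDER BY avg_acctbal DESC; Each row of the result corresponds to one bar in the chart. 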
\u2003 Setup a Database Connection to watsonx.data Open another terminal window for this next step. Once Apache Superset has started loading examples, you can issue the following command as watsonx or root . docker cp /certs/lh-ssl-ts.crt superset_app:/tmp/lh-ssl-ts.crt In the Apache Superset console, press the Settings button on the far right and select Database connections. Then select the [+ DATABASE] option on the far-right side of the panel. \u2003 A connection dialog will display. Select Presto as the database connection type. In the SQLALCHEMY URI field, enter the following information to connect to the hive_data catalog which contains the GOSALES, ONTIME, and TAXI data. presto://ibmlhadmin:password@ibm-lh-presto-svc:8443/hive_data Enter the following information to connect to the iceberg_data catalog which will contain any tables you created when running the examples in the lab. presto://ibmlhadmin:password@ibm-lh-presto-svc:8443/iceberg_data Select the Advanced tab. Copy the following information into the security box. {\"connect_args\":{\"protocol\":\"https\",\"requests_kwargs\":{\"verify\":\"/tmp/lh-ssl-ts.crt\"}}} Press the Connect button to create the connection. Create reports/charts/dashboards Once the connection has been tested and created for watsonx.data, we can click on Datasets and create a new dataset based on the customer table in the workshop schema. Reports/dashboards can then be created using the very intuitive Superset interface. Note : The Apache Superset team removes, inserts and updates charts on a frequent basis with no advance notification. The example you see below may not be exactly the same when you run the code. This is not something that we can control in the demonstration environment. Select Datasets at the top of the Apache Superset window. Press [+ DATASET]. In the Database field, select Presto. The schemas will take a few seconds to load. Select the workshop schema. Select customer from the list. The display will show the columns associated with this table. In the bottom right-hand corner is a button named CREATE DATASET AND CREATE CHART. Press that to display the following panel. To create a simple Bar Chart, we start by selecting the Bar Chart icon. If you click it once it displays information about the chart type. If you double-click it, the chart builder screen will display. Click on the mktsegment field and drag it into the DIMENSIONS field. Then drag the acctbal field into the METRICS field. The program will ask how the field is to be computed. Select AVG from the list and SAVE. Now press the CREATE CHART button found at the bottom of the screen. Try to create different charts/dashboards if you have time. Note : When you are finished using Apache Superset, press CTRL-C (Control-C) in the terminal window that you used to start it. This will stop the program and release the resources it is using. If you press CTRL-C twice, it immediately kills the program, and you may lose some of the work that you have done.","title":"Apache Superset"},{"location":"wxd-superset/#reportingdashboarding-using-apache-superset","text":"Apache Superset is not a part of watsonx.data and is only used to demonstrate the capability to connect to watsonx.data from other BI/Reporting tools. You will need to install Apache Superset as part of this lab. The Superset repository needs to be in sync with the image being downloaded, so these libraries cannot be preloaded into this development image. Open a terminal window and connect via SSH as the watsonx user. 
Do not connect as the root user. Clone the Apache Superset repository with the git command. This command typically takes less than 1 minute to download the code. git clone https://github.com/apache/superset.git The docker-compose-non-dev.yml file needs to be updated so that Apache Superset can access the same network that watsonx.data is using. cd ./superset cp docker-compose-non-dev.yml docker-compose-non-dev-backup.yml sed '/version: \"3.7\"/q' docker-compose-non-dev.yml > yamlfix.txt cat <<EOF >> yamlfix.txt networks: default: external: True name: ibm-lh-network EOF sed -e '1,/version: \"3.7\"/ d' docker-compose-non-dev.yml >> yamlfix.txt We update the Apache Superset code to version 2.1.0 . sed 's/\\${TAG:-latest-dev}/2.1.0/' yamlfix.txt > docker-compose-non-dev.yml Use docker compose to start Apache Superset. docker compose -f docker-compose-non-dev.yml up The docker compose command will download the necessary code for Apache Superset and start the service. The terminal session will contain the logging information for the service. The process is running in the foreground so you will see all the messages being produced by the program. If you want to stop the service at any time you will need to press CTRL-C. If you close this terminal window at any time, the process will stop. When you see \"Init Step 4/4\", the service is ready for connections. If you have already installed Apache Superset and you stopped it, there is no need to reinstall the program. Go back to the /home/watsonx/superset directory and run the docker compose command again. Once the service is running, open your browser and navigate to the URL and port that were provided in your TechZone reservation. The credentials for Apache Superset are userid admin , password admin .","title":"Reporting/Dashboarding using Apache Superset"},{"location":"wxd-superset/#setup-a-database-connection-to-watsonxdata","text":"Open another terminal window for this next step. Once Apache Superset has started loading examples, you can issue the following command as watsonx or root . docker cp /certs/lh-ssl-ts.crt superset_app:/tmp/lh-ssl-ts.crt In the Apache Superset console, press the Settings button on the far right and select Database connections. Then select the [+ DATABASE] option on the far-right side of the panel. \u2003 A connection dialog will display. Select Presto as the database connection type. In the SQLALCHEMY URI field, enter the following information to connect to the hive_data catalog which contains the GOSALES, ONTIME, and TAXI data. presto://ibmlhadmin:password@ibm-lh-presto-svc:8443/hive_data Enter the following information to connect to the iceberg_data catalog which will contain any tables you created when running the examples in the lab. presto://ibmlhadmin:password@ibm-lh-presto-svc:8443/iceberg_data Select the Advanced tab. Copy the following information into the security box. {\"connect_args\":{\"protocol\":\"https\",\"requests_kwargs\":{\"verify\":\"/tmp/lh-ssl-ts.crt\"}}} Press the Connect button to create the connection.","title":"Setup a Database Connection to watsonx.data"},{"location":"wxd-superset/#create-reportschartsdashboards","text":"Once the connection has been tested and created for watsonx.data, we can click on Datasets and create a new dataset based on the customer table in the workshop schema. Reports/dashboards can then be created using the very intuitive Superset interface. Note : The Apache Superset team removes, inserts and updates charts on a frequent basis with no advance notification. 
The example you see below may not be exactly the same when you run the code. This is not something that we can control in the demonstration environment. Select Datasets at the top of the Apache Superset window. Press [+ DATASET]. In the Database field, select Presto. The schemas will take a few seconds to load. Select the workshop schema. Select customer from the list. The display will show the columns associated with this table. In the bottom right-hand corner is a button named CREATE DATASET AND CREATE CHART. Press that to display the following panel. To create a simple Bar Chart, we start by selecting the Bar Chart icon. If you click it once it displays information about the chart type. If you double-click it, the chart builder screen will display. Click on the mktsegment field and drag it into the DIMENSIONS field. Then drag the acctbal field into the METRICS field. The program will ask how the field is to be computed. Select AVG from the list and SAVE. Now press the CREATE CHART button found at the bottom of the screen. Try to create different charts/dashboards if you have time. Note : When you are finished using Apache Superset, press CTRL-C (Control-C) in the terminal window that you used to start it. This will stop the program and release the resources it is using. If you press CTRL-C twice, it immediately kills the program, and you may lose some of the work that you have done.","title":"Create reports/charts/dashboards"},{"location":"wxd-systemconnector/","text":"Using Presto System Connector The Presto System connector provides information and metrics about the currently running Presto cluster. You can use this connector to monitor the workloads on the Presto cluster using normal SQL queries. Make sure you are the root user and in the proper development directory. cd /root/ibm-lh-dev/bin Start the Presto CLI. ./presto-cli What queries are currently running? select * from \"system\".runtime.queries limit 5; query_id | state | user | source | query | resource_group_id | queued_time_ms | analysis_time_ms | created | started | last_heartbeat | end -----------------------------+----------+------------+------------------+-------------------------------------------------------------+-------------------+----------------+------------------+-------------------------+-------------------------+-------------------------+------------------------- 20230626_182942_00007_4suid | FINISHED | ibmlhadmin | presto-cli | show tables | [global] | 0 | 33 | 2023-06-26 18:29:40.628 | 2023-06-26 18:29:40.817 | 2023-06-26 18:29:41.095 | 2023-06-26 18:29:41.118 20230626_182938_00005_4suid | FINISHED | ibmlhadmin | presto-cli | SHOW FUNCTIONS | [global] | 1 | 607 | 2023-06-26 18:29:36.718 | 2023-06-26 18:29:36.777 | 2023-06-26 18:29:37.707 | 2023-06-26 18:29:37.742 20230626_192655_00031_4suid | FINISHED | ibmlhadmin | presto-cli | show schemas | [global] | 1 | 257 | 2023-06-26 19:26:53.739 | 2023-06-26 19:26:54.043 | 2023-06-26 19:26:54.845 | 2023-06-26 19:26:54.866 20230626_183851_00018_4suid | FINISHED | ibmlhadmin | nodejs-client | select * from system.runtime.queries order by query_id desc | [global] | 1 | 27 | 2023-06-26 18:38:49.169 | 2023-06-26 18:38:49.293 | 2023-06-26 18:38:50.084 | 2023-06-26 18:38:50.121 20230626_185405_00021_4suid | FINISHED | ibmlhadmin | presto-go-client | SHOW TABLES | [global] | 0 | 56 | 2023-06-26 18:54:03.542 | 2023-06-26 18:54:03.729 | 2023-06-26 18:54:04.042 | 2023-06-26 18:54:04.041 (5 rows) What tasks make up a query and where is the task running? 
select * from \"system\".runtime.tasks limit 5; node_id | task_id | stage_execution_id | stage_id | query_id | state | splits | queued_splits | running_splits | completed_splits | split_scheduled_time_ms | split_cpu_time_ms | split_blocked_time_ms | raw_input_bytes | raw_input_rows | processed_input_bytes | processed_input_rows | output_bytes | output_rows | physical_written_bytes | created | start | last_heartbeat | end --------------------------------------+-----------------------------------+---------------------------------+-------------------------------+-----------------------------+----------+--------+---------------+----------------+------------------+-------------------------+-------------------+-----------------------+-----------------+----------------+-----------------------+----------------------+--------------+-------------+------------------------+-------------------------+-------------------------+-------------------------+------------------------- 17ffe5e1-affe-4339-b618-0f60723cabf4 | 20230626_194106_00035_4suid.1.0.0 | 20230626_194106_00035_4suid.1.0 | 20230626_194106_00035_4suid.1 | 20230626_194106_00035_4suid | FINISHED | 1 | 0 | 0 | 1 | 14 | 2 | 0 | 5965 | 36 | 5965 | 36 | 7269 | 36 | 0 | 2023-06-26 19:41:04.606 | 2023-06-26 19:41:04.618 | 2023-06-26 19:41:04.639 | 2023-06-26 19:41:04.665 17ffe5e1-affe-4339-b618-0f60723cabf4 | 20230626_194309_00038_4suid.1.0.0 | 20230626_194309_00038_4suid.1.0 | 20230626_194309_00038_4suid.1 | 20230626_194309_00038_4suid | FINISHED | 1 | 0 | 0 | 1 | 15 | 2 | 0 | 6125 | 37 | 6125 | 37 | 866 | 5 | 0 | 2023-06-26 19:43:07.346 | 2023-06-26 19:43:07.357 | 2023-06-26 19:43:07.385 | 2023-06-26 19:43:07.398 17ffe5e1-affe-4339-b618-0f60723cabf4 | 20230626_194106_00035_4suid.0.0.0 | 20230626_194106_00035_4suid.0.0 | 20230626_194106_00035_4suid.0 | 20230626_194106_00035_4suid | FINISHED | 16 | 0 | 0 | 16 | 60 | 1 | 440 | 7096 | 36 | 7269 | 36 | 7269 | 36 | 0 | 2023-06-26 19:41:04.611 | 2023-06-26 19:41:04.626 | 2023-06-26 19:41:04.634 | 2023-06-26 19:41:04.682 17ffe5e1-affe-4339-b618-0f60723cabf4 | 20230626_194309_00038_4suid.0.0.0 | 20230626_194309_00038_4suid.0.0 | 20230626_194309_00038_4suid.0 | 20230626_194309_00038_4suid | FINISHED | 17 | 0 | 0 | 17 | 108 | 2 | 189 | 1100 | 5 | 866 | 5 | 866 | 5 | 0 | 2023-06-26 19:43:07.356 | 2023-06-26 19:43:07.380 | 2023-06-26 19:43:07.380 | 2023-06-26 19:43:07.419 17ffe5e1-affe-4339-b618-0f60723cabf4 | 20230626_194431_00039_4suid.1.0.0 | 20230626_194431_00039_4suid.1.0 | 20230626_194431_00039_4suid.1 | 20230626_194431_00039_4suid | RUNNING | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2023-06-26 19:44:29.346 | 2023-06-26 19:44:29.352 | 2023-06-26 19:44:29.353 | NULL (5 rows) Quit Presto. quit;","title":"System Connector"},{"location":"wxd-systemconnector/#using-presto-system-connector","text":"The Presto System connector provides information and metrics about the currently running Presto cluster. You can use this function to monitor the workloads on the Presto cluster using normal SQL queries. Make sure you are the root user and in the proper development directory. cd /root/ibm-lh-dev/bin Start the Presto CLI. ./presto-cli What queries are currently running? 
select * from \"system\".runtime.queries limit 5; query_id | state | user | source | query | resource_group_id | queued_time_ms | analysis_time_ms | created | started | last_heartbeat | end -----------------------------+----------+------------+------------------+-------------------------------------------------------------+-------------------+----------------+------------------+-------------------------+-------------------------+-------------------------+------------------------- 20230626_182942_00007_4suid | FINISHED | ibmlhadmin | presto-cli | show tables | [global] | 0 | 33 | 2023-06-26 18:29:40.628 | 2023-06-26 18:29:40.817 | 2023-06-26 18:29:41.095 | 2023-06-26 18:29:41.118 20230626_182938_00005_4suid | FINISHED | ibmlhadmin | presto-cli | SHOW FUNCTIONS | [global] | 1 | 607 | 2023-06-26 18:29:36.718 | 2023-06-26 18:29:36.777 | 2023-06-26 18:29:37.707 | 2023-06-26 18:29:37.742 20230626_192655_00031_4suid | FINISHED | ibmlhadmin | presto-cli | show schemas | [global] | 1 | 257 | 2023-06-26 19:26:53.739 | 2023-06-26 19:26:54.043 | 2023-06-26 19:26:54.845 | 2023-06-26 19:26:54.866 20230626_183851_00018_4suid | FINISHED | ibmlhadmin | nodejs-client | select * from system.runtime.queries order by query_id desc | [global] | 1 | 27 | 2023-06-26 18:38:49.169 | 2023-06-26 18:38:49.293 | 2023-06-26 18:38:50.084 | 2023-06-26 18:38:50.121 20230626_185405_00021_4suid | FINISHED | ibmlhadmin | presto-go-client | SHOW TABLES | [global] | 0 | 56 | 2023-06-26 18:54:03.542 | 2023-06-26 18:54:03.729 | 2023-06-26 18:54:04.042 | 2023-06-26 18:54:04.041 (5 rows) What tasks make up a query and where is the task running? select * from \"system\".runtime.tasks limit 5; node_id | task_id | stage_execution_id | stage_id | query_id | state | splits | queued_splits | running_splits | completed_splits | split_scheduled_time_ms | split_cpu_time_ms | split_blocked_time_ms | raw_input_bytes | raw_input_rows | processed_input_bytes | processed_input_rows | output_bytes | output_rows | physical_written_bytes | created | start | last_heartbeat | end --------------------------------------+-----------------------------------+---------------------------------+-------------------------------+-----------------------------+----------+--------+---------------+----------------+------------------+-------------------------+-------------------+-----------------------+-----------------+----------------+-----------------------+----------------------+--------------+-------------+------------------------+-------------------------+-------------------------+-------------------------+------------------------- 17ffe5e1-affe-4339-b618-0f60723cabf4 | 20230626_194106_00035_4suid.1.0.0 | 20230626_194106_00035_4suid.1.0 | 20230626_194106_00035_4suid.1 | 20230626_194106_00035_4suid | FINISHED | 1 | 0 | 0 | 1 | 14 | 2 | 0 | 5965 | 36 | 5965 | 36 | 7269 | 36 | 0 | 2023-06-26 19:41:04.606 | 2023-06-26 19:41:04.618 | 2023-06-26 19:41:04.639 | 2023-06-26 19:41:04.665 17ffe5e1-affe-4339-b618-0f60723cabf4 | 20230626_194309_00038_4suid.1.0.0 | 20230626_194309_00038_4suid.1.0 | 20230626_194309_00038_4suid.1 | 20230626_194309_00038_4suid | FINISHED | 1 | 0 | 0 | 1 | 15 | 2 | 0 | 6125 | 37 | 6125 | 37 | 866 | 5 | 0 | 2023-06-26 19:43:07.346 | 2023-06-26 19:43:07.357 | 2023-06-26 19:43:07.385 | 2023-06-26 19:43:07.398 17ffe5e1-affe-4339-b618-0f60723cabf4 | 20230626_194106_00035_4suid.0.0.0 | 20230626_194106_00035_4suid.0.0 | 20230626_194106_00035_4suid.0 | 20230626_194106_00035_4suid | FINISHED | 16 | 0 | 0 | 16 | 60 | 1 | 440 | 7096 | 36 | 7269 | 36 | 
7269 | 36 | 0 | 2023-06-26 19:41:04.611 | 2023-06-26 19:41:04.626 | 2023-06-26 19:41:04.634 | 2023-06-26 19:41:04.682 17ffe5e1-affe-4339-b618-0f60723cabf4 | 20230626_194309_00038_4suid.0.0.0 | 20230626_194309_00038_4suid.0.0 | 20230626_194309_00038_4suid.0 | 20230626_194309_00038_4suid | FINISHED | 17 | 0 | 0 | 17 | 108 | 2 | 189 | 1100 | 5 | 866 | 5 | 866 | 5 | 0 | 2023-06-26 19:43:07.356 | 2023-06-26 19:43:07.380 | 2023-06-26 19:43:07.380 | 2023-06-26 19:43:07.419 17ffe5e1-affe-4339-b618-0f60723cabf4 | 20230626_194431_00039_4suid.1.0.0 | 20230626_194431_00039_4suid.1.0 | 20230626_194431_00039_4suid.1 | 20230626_194431_00039_4suid | RUNNING | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2023-06-26 19:44:29.346 | 2023-06-26 19:44:29.352 | 2023-06-26 19:44:29.353 | NULL (5 rows) Quit Presto. quit;","title":"Using Presto System Connector"},{"location":"wxd-timetravel/","text":"Time Travel Time travel allows you to change the view of the data to a previous point in time. This is not the same as an AS OF query commonly used in SQL: the table itself is rolled back to a prior state. Let us look at the snapshots available for the customer table in the workshop schema. We currently have just 1 snapshot. First make sure you are in the proper directory. cd /root/ibm-lh-dev/bin Connect to Presto using the workshop schema. ./presto-cli --catalog iceberg_data --schema workshop Check current snapshots \u2013 STARTING STATE. SELECT * FROM iceberg_data.workshop.\"customer$snapshots\" ORDER BY committed_at; committed_at | snapshot_id | parent_id | operation | manifest_list | summary -----------------------------+---------------------+-----------+-----------+------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 2023-06-05 18:30:12.994 UTC | 6243511110201494487 | NULL | append | s3a://iceberg-bucket/customer/metadata/snap-6243511110201494487-1-b5ab84dc-671a-426a-a734-940baa49a11f.avro | {changed-partition-count=1, added-data-files=1, total-equality-deletes=0, added-records=1500, total-position-deletes=0, added-files-size=75240, total-delete-files=0, total-files-size=75240, total-records=1500, total-data-files=1} (1 row) Capture the first snapshot ID returned by the SQL statement. You will need this value when you run the rollback command. SELECT snapshot_id FROM iceberg_data.workshop.\"customer$snapshots\" ORDER BY committed_at; snapshot_id --------------------- 6243511110201494487 (1 row) Make a note of the number that was returned by the query above. Insert the following record to change the customer table in the workshop schema. insert into customer values(1501,'Deepak','IBM SVL',16,'123-212-3455', 123,'AUTOMOBILE','Testing snapshots'); \u2003 Let us look at the snapshots available for the customer table in the workshop schema. You should now have 2 snapshots. 
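As an aside, the \"customer$snapshots\" metadata table can be filtered and sorted like any ordinary table. The following sketch (using the same workshop schema) returns only the most recent snapshot ID and its commit time, which avoids scrolling through the full history as the snapshot list grows: SELECT snapshot_id, committed_at FROM iceberg_data.workshop.\"customer$snapshots\" ORDER BY committed_at DESC LIMIT 1; The full snapshot listing is shown below. 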
SELECT * FROM iceberg_data.workshop.\"customer$snapshots\" ORDER BY committed_at; committed_at | snapshot_id | parent_id | operation | manifest_list | summary -----------------------------+---------------------+---------------------+-----------+------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 2023-06-05 18:30:12.994 UTC | 6243511110201494487 | NULL | append | s3a://iceberg-bucket/customer/metadata/snap-6243511110201494487-1-b5ab84dc-671a-426a-a734-940baa49a11f.avro | {changed-partition-count=1, added-data-files=1, total-equality-deletes=0, added-records=1500, total-position-deletes=0, added-files-size=75240, total-delete-files=0, total-files-size=75240, total-records=1500, total-data-files=1} 2023-06-05 18:52:49.193 UTC | 7110570704088319509 | 6243511110201494487 | append | s3a://iceberg-bucket/customer/metadata/snap-7110570704088319509-1-ef26bcf1-c122-4ea4-86b7-ba26369be374.avro | {changed-partition-count=1, added-data-files=1, total-equality-deletes=0, added-records=1, total-position-deletes=0, added-files-size=1268, total-delete-files=0, total-files-size=76508, total-records=1501, total-data-files=2} (2 rows) Querying the customer table in the workshop schema, we can see the record inserted with name=\u2019Deepak\u2019. select * from customer where name='Deepak'; custkey | name | address | nationkey | phone | acctbal | mktsegment | comment ---------+--------+---------+-----------+--------------+---------+------------+------------------- 1501 | Deepak | IBM SVL | 16 | 123-212-3455 | 123.0 | AUTOMOBILE | Testing snapshots (1 row) We realize that we don\u2019t want the recent updates or just want to see what the data was at any point in time to respond to regulatory requirements. We will leverage the out-of-box system function rollback_to_snapshot to rollback to an older snapshot. The syntax for this function is: CALL iceberg_data.system.rollback_to_snapshot('workshop','customer',x); The \"x\" would get replaced with the snapshot_id number that was found in the earlier query. It will be different on your system than the examples above. Copy the next code segment into Presto. CALL iceberg_data.system.rollback_to_snapshot('workshop','customer', You will see output similar to the following: CALL iceberg_data.system.rollback_to_snapshot('workshop','customer', -> At this point you will need to copy and paste your snapshot_id into the Presto command line and press return or enter. You will see following: CALL iceberg_data.system.rollback_to_snapshot('workshop','customer', -> 7230522396120575591 7230522396120575591 Now you will need to terminate the command with a ); to see the final result. ); CALL iceberg_data.system.rollback_to_snapshot('workshop','customer', -> 7230522396120575591 7230522396120575591 -> ); ); CALL Querying the customer table in the workshop schema, we cannot see the record inserted with name=\u2019Deepak\u2019. select * from customer where name='Deepak'; custkey | name | address | nationkey | phone | acctbal | mktsegment | comment ---------+--------+---------+-----------+--------------+---------+------------+------------------- (0 rows) Quit Presto. 
quit;","title":"Time Travel"},{"location":"wxd-timetravel/#time-travel","text":"Time travel allows you change the view of the data to a previous time. This is not the same as an AS OF query commonly used in SQL. The data is rolled back to a prior time. Let us look at the snapshots available for the customer table in the workshop schema. We currently have just 1 snapshot. First make sure you are in the proper directory. cd /root/ibm-lh-dev/bin Connect to Presto using the workshop schema. ./presto-cli --catalog iceberg_data --schema workshop Check current snapshots \u2013 STARTING STATE. SELECT * FROM iceberg_data.workshop.\"customer$snapshots\" ORDER BY committed_at; committed_at | snapshot_id | parent_id | operation | manifest_list | summary -----------------------------+---------------------+-----------+-----------+------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 2023-06-05 18:30:12.994 UTC | 6243511110201494487 | NULL | append | s3a://iceberg-bucket/customer/metadata/snap-6243511110201494487-1-b5ab84dc-671a-426a-a734-940baa49a11f.avro | {changed-partition-count=1, added-data-files=1, total-equality-deletes=0, added-records=1500, total-position-deletes=0, added-files-size=75240, total-delete-files=0, total-files-size=75240, total-records=1500, total-data-files=1} (1 row) Capture the first snapshot ID returned by the SQL statement. You will need this value when you run the rollback command. SELECT snapshot_id FROM iceberg_data.workshop.\"customer$snapshots\" ORDER BY committed_at; snapshot_id --------------------- 6243511110201494487 (1 row) Remember that number that was returned with the query above. Insert the following record to change the customer table in the workshop schema. insert into customer values(1501,'Deepak','IBM SVL',16,'123-212-3455', 123,'AUTOMOBILE','Testing snapshots'); \u2003 Let us look at the snapshots available for the customer table in the workshop schema. You should have 2 snapshots. 
SELECT * FROM iceberg_data.workshop.\"customer$snapshots\" ORDER BY committed_at; committed_at | snapshot_id | parent_id | operation | manifest_list | summary -----------------------------+---------------------+---------------------+-----------+------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 2023-06-05 18:30:12.994 UTC | 6243511110201494487 | NULL | append | s3a://iceberg-bucket/customer/metadata/snap-6243511110201494487-1-b5ab84dc-671a-426a-a734-940baa49a11f.avro | {changed-partition-count=1, added-data-files=1, total-equality-deletes=0, added-records=1500, total-position-deletes=0, added-files-size=75240, total-delete-files=0, total-files-size=75240, total-records=1500, total-data-files=1} 2023-06-05 18:52:49.193 UTC | 7110570704088319509 | 6243511110201494487 | append | s3a://iceberg-bucket/customer/metadata/snap-7110570704088319509-1-ef26bcf1-c122-4ea4-86b7-ba26369be374.avro | {changed-partition-count=1, added-data-files=1, total-equality-deletes=0, added-records=1, total-position-deletes=0, added-files-size=1268, total-delete-files=0, total-files-size=76508, total-records=1501, total-data-files=2} (2 rows) Querying the customer table in the workshop schema, we can see the record inserted with name=\u2019Deepak\u2019. select * from customer where name='Deepak'; custkey | name | address | nationkey | phone | acctbal | mktsegment | comment ---------+--------+---------+-----------+--------------+---------+------------+------------------- 1501 | Deepak | IBM SVL | 16 | 123-212-3455 | 123.0 | AUTOMOBILE | Testing snapshots (1 row) We realize that we don\u2019t want the recent updates or just want to see what the data was at any point in time to respond to regulatory requirements. We will leverage the out-of-box system function rollback_to_snapshot to rollback to an older snapshot. The syntax for this function is: CALL iceberg_data.system.rollback_to_snapshot('workshop','customer',x); The \"x\" would get replaced with the snapshot_id number that was found in the earlier query. It will be different on your system than the examples above. Copy the next code segment into Presto. CALL iceberg_data.system.rollback_to_snapshot('workshop','customer', You will see output similar to the following: CALL iceberg_data.system.rollback_to_snapshot('workshop','customer', -> At this point you will need to copy and paste your snapshot_id into the Presto command line and press return or enter. You will see following: CALL iceberg_data.system.rollback_to_snapshot('workshop','customer', -> 7230522396120575591 7230522396120575591 Now you will need to terminate the command with a ); to see the final result. ); CALL iceberg_data.system.rollback_to_snapshot('workshop','customer', -> 7230522396120575591 7230522396120575591 -> ); ); CALL Querying the customer table in the workshop schema, we cannot see the record inserted with name=\u2019Deepak\u2019. select * from customer where name='Deepak'; custkey | name | address | nationkey | phone | acctbal | mktsegment | comment ---------+--------+---------+-----------+--------------+---------+------------+------------------- (0 rows) Quit Presto. 
quit;","title":"Time Travel"},{"location":"wxd-troubleshooting/","text":"Troubleshooting watsonx.data Although we have tried to make the lab as error-free as possible, occasionally things will go wrong. Here is a list of common questions, problems, and potential solutions. What are the passwords for the services I Can't Open up a Terminal Window with VNC or Guacamole A SQL Statement failed but there are no error messages Apache Superset isn't Starting Apache Superset screens differ from the lab Too many incorrect logins using VNC and now I'm blocked Presto doesn't appear to be working Displaying Db2 Schema is failing Queries are failing with a 400 code Queries are failing with a 200 or 500 code Queries are failing with memory errors SSH, VNC and watsonx.data UI are not working No access to Presto/Minio UI after restart Firefox and Chrome freeze when connecting to MinIO What are the passwords for the services? See the section on Passwords . You can get all passwords for the system when you are logged in as the watsonx user by using the following command. cat /certs/passwords You can also use the Jupyter notebook link to display the userids and passwords for the services. I Can't Open up a Terminal Window with VNC or Guacamole First thing to remember is that you can't use VNC and the TechZone VM Remote Console (Guacamole) interface at the same time. Only one can be active at a time. If you can't use terminal windows in VNC If you find that the terminal icons \"spins\" inside the VNC window, this is caused by attempting to connect to the virtual machine by using the VM Remote Console button in your reservation details screen. To fix this problem, you must log out of the VNC session (top right corner of the Linux desktop - press the power button and choose logout). Once VNC logs back in you will be able use the terminal window. A SQL Statement failed, but there are no error messages You need to use the Presto console and search for the SQL statement. Click on the Query ID to find more details of the statement execution and scroll to the bottom of the web page to see any error details. Apache Superset isn't Starting If Superset doesn't start for some reason, you will need to reset it completely to try it again. First make sure you are connected as the watsonx user not root . Make sure you have stopped the terminal session that is running Apache Superset. Next remove the Apache Superset directory. sudo rm -rf /home/watsonx/superset We remove the docker images associated with Apache Superset. If no containers or volumes exist you will get an error message. docker ps -a -q --filter \"name=superset\" | xargs docker container rm --force docker volume list -q --filter \"name=superset\" | xargs docker volume rm --force Download the superset code again. git clone https://github.com/apache/superset.git The docker-compose-non-dev.yml file needs to be updated so that Apache Superset can access the same network that watsonx.data is using. cd ./superset cp docker-compose-non-dev.yml docker-compose-non-dev-backup.yml sed '/version: \"3.7\"/q' docker-compose-non-dev.yml > yamlfix.txt cat <> yamlfix.txt networks: default: external: True name: ibm-lh-network EOF sed -e '1,/version: \"3.7\"/ d' docker-compose-non-dev.yml >> yamlfix.txt We update the Apache Superset code to version 2.1.0 . sed 's/\\${TAG:-latest-dev}/2.1.0/' yamlfix.txt > docker-compose-non-dev.yml Use docker-compose to start Apache Superset. 
nohup docker compose -f docker-compose-non-dev.yml up & The nohup command will issue a message indicating that output will be directed to the nohup.out file. It takes some time for the service to start, so be patient! You can view any output from the Apache Superset system by viewing the nohup.out file in the directory where you installed superset. Apache Superset screens differ from the lab The Apache Superset project makes frequent changes to the types of charts that are available. In some cases they remove or merge charts. Since these chart changes are dynamic, we are not able to guarantee that our examples will look the same as what you might have on your system. Presto doesn't appear to be working If you find that the watsonx.data UI is generating error messages that suggest that queries are not running, or that the Presto service is dead, you can force a soft restart of Presto with the following command: docker restart ibm-lh-presto This will restart the Presto server. If that does not fix your problem, you will need to do a hard reset using the following commands: sudo su - cd /root/ibm-lh-dev/bin ./stop_service ibm-lh-presto ./start_service ibm-lh-presto check_presto The command will wait until the service is running before exiting. Displaying Db2 Schema is failing Occasionally when attempting to expand the Db2 catalog (schema), the watsonx.data UI will not display any data or will issue an error message. You can try refreshing the browser (not the refresh icon inside the UI) and try again. If you find that this is failing again, open the Query workspace and run the following SQL (replace db2_gosales with the name you cataloged the database with). select count(*) from db2_gosales.gosalesdw.go_org_dim The result should be 123 and hopefully the tables that are part of the schema will display for you. Queries are failing with a 400 code The watsonx.data UI will log you out after a period of inactivity, but doesn't tell you that this has happened. When you attempt to run a query, the error that is returned (400) indicates that you need to log back in again. Queries are failing with a 200 or 500 code A 500 code may indicate that the watsonx.data UI has a problem connecting with the Presto engine. First log out of the console and try logging back on. If that fails to solve the problem, you will need to restart the console. Open up a terminal window into the server: As the root user, restart the docker container that is running the watsonx.data UI. docker restart lhconsole-nodeclient-svc Queries fail because of insufficient memory If you are running a complex query, you may get an error message similar to \"Query exceeded per-node user memory limit\" or something similar. Watsonx.data (Presto) attempts to limit the amount of resources being used by a query and will stop a query if it exceeds a certain threshold. You can change the behavior of the system by making the following changes. Note : During this step you will disconnect anyone running a query on the server. What you need to do is make a change to the configuration settings of the Presto engine. As the root user, enter the docker container for the presto engine: docker exec -it ibm-lh-presto /bin/bash Next, copy the original config file to a safe place in case we make an error: cp /opt/presto/etc/config.properties /opt/presto/etc/config.properties.backup
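If you want a record of the current settings before changing anything, you can list the query-related entries already in the file (a simple check; on an unmodified system this may return nothing, which just means the defaults are in effect). grep query /opt/presto/etc/config.properties Then update the properties file.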
cat >> /opt/presto/etc/config.properties << EOL experimental.spiller-spill-path=/tmp experimental.spiller-max-used-space-threshold=0.7 experimental.max-spill-per-node=10GB experimental.query-max-spill-per-node=10GB experimental.spill-enabled=true query.max-memory=10GB query.max-memory-per-node=10GB query.max-total-memory-per-node=10GB query.max-total-memory=10GB EOL Double-check that it worked. cat /opt/presto/etc/config.properties | grep experimental experimental.max-spill-per-node=10GB experimental.query-max-spill-per-node=10GB experimental.spill-enabled=true experimental.spiller-max-used-space-threshold=0.7 experimental.spiller-spill-path=/tmp If it all looks good, then exit the container. exit And now we restart the container. Make sure that you don't impact other users! docker restart ibm-lh-presto Now try running your query again. Note : Once you make this change, only restart Presto using the above command, otherwise you will lose the changes. Too many incorrect logins using VNC and now I'm blocked from connecting If you lock yourself out of VNC because of too many incorrect logins, you can reset the service with the following commands. Connect as the root user, then run the following command and you should be able to log in again. systemctl restart vncserver@:1 exit SSH, VNC and watsonx.data UI are not working Symptoms: You've tried to use SSH to log into the system, and you get a timeout error. All the Web-based UIs (watsonx.data UI, Presto) fail. Find your email message that contains details of your reservation. Details of the reservation, and the page that contains them, can be found in the Accessing the reservation section. Once the details appear, scroll down to the bottom of the web page, and you will see the VM Remote Console button. Clicking on this button will display the logon screen for the server. If you see this screen, the system is running and there is something wrong with the watsonx.data service (see instructions below). If you see the following screen: This means your server has been turned off. Click on the Power on button. Make sure to press the Yes button to turn the power on! In a few minutes you should see the logon screen again. Wait for a few minutes for all the services to start, and then you will be able to use SSH, VNC, and the watsonx.data UI. Reset watsonx.data If you can log into the watsonx userid using the VM Remote Console, you can reset the watsonx.data server with the following steps. SSH into the server as the root user. Then switch to the development code bin directory. cd /root/ibm-lh-dev/bin Check the status of the system with the following command. ./status.sh --all Output will look similar to: using /root/ibm-lh-dev/localstorage/volumes as data root directory for user: root/1001 infra config location is /root/ibm-lh-dev/localstorage/volumes/infra lhconsole-ui running 0.0.0.0:9443->8443/tcp, :::9443->8443/tcp lhconsole-nodeclient-svc running 3001/tcp lhconsole-javaapi-svc running 8090/tcp lhconsole-api running 3333/tcp, 8081/tcp ibm-lh-presto running 0.0.0.0:8443->8443/tcp, :::8443->8443/tcp ibm-lh-hive-metastore running ibm-lh-postgres running 5432/tcp ibm-lh-minio running
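You can also confirm the state of the individual containers directly with docker, which gives an alternative view of the same information (the name filter below simply matches the ibm-lh container names shown above). docker ps --filter \"name=ibm-lh\" If any of the services are not running, you will need to restart the system with the following set of commands.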
cd /root/ibm-lh-dev/bin ./stop.sh export LH_RUN_MODE=diag ./start.sh Wait for all services to start and then check to see if you can connect to the watsonx.data UI. No access to Presto/Minio UI after restart If you are using a TechZone image that has been suspended or restarted, you may come across a situation where you are unable to connect to any service that uses the http protocol. The watsonx.data service needs to have a diagnostic flag set that opens up these ports, and sometimes this diagnostic setting is not updated. To manually stop and start the system, you will need to connect with root user privileges and run the following commands: sudo su - cd /root/ibm-lh-dev/bin ./stop.sh export LH_RUN_MODE=diag ./start.sh This set of commands will stop all the services in watsonx.data and restart them in diagnostic mode. This will now open the http ports for use. Firefox and Chrome freeze when connecting to MinIO Firefox and Chrome on OSX will occasionally freeze when connecting to the MinIO console. The Safari browser is much more reliable. This problem appears to be caused by some features which are not properly handled by these browsers.","title":"Troubleshooting"},{"location":"wxd-troubleshooting/#troubleshooting-watsonxdata","text":"Although we have tried to make the lab as error-free as possible, occasionally things will go wrong. Here is a list of common questions, problems, and potential solutions. What are the passwords for the services I Can't Open up a Terminal Window with VNC or Guacamole A SQL Statement failed but there are no error messages Apache Superset isn't Starting Apache Superset screens differ from the lab Too many incorrect logins using VNC and now I'm blocked Presto doesn't appear to be working Displaying Db2 Schema is failing Queries are failing with a 400 code Queries are failing with a 200 or 500 code Queries are failing with memory errors SSH, VNC and watsonx.data UI are not working No access to Presto/Minio UI after restart Firefox and Chrome freeze when connecting to MinIO","title":"Troubleshooting watsonx.data"},{"location":"wxd-troubleshooting/#what-are-the-passwords-for-the-services","text":"See the section on Passwords . You can get all passwords for the system when you are logged in as the watsonx user by using the following command. cat /certs/passwords You can also use the Jupyter notebook link to display the userids and passwords for the services.","title":"What are the passwords for the services?"},{"location":"wxd-troubleshooting/#i-cant-open-up-a-terminal-window-with-vnc-or-guacamole","text":"The first thing to remember is that you can't use VNC and the TechZone VM Remote Console (Guacamole) interface at the same time. Only one can be active at a time.","title":"I Can't Open up a Terminal Window with VNC or Guacamole"},{"location":"wxd-troubleshooting/#if-you-cant-use-terminal-windows-in-vnc","text":"If you find that the terminal icon \"spins\" inside the VNC window, this is caused by attempting to connect to the virtual machine by using the VM Remote Console button in your reservation details screen. To fix this problem, you must log out of the VNC session (top right corner of the Linux desktop - press the power button and choose logout). Once VNC logs back in, you will be able to use the terminal window.","title":"If you can't use terminal windows in VNC"},{"location":"wxd-troubleshooting/#a-sql-statement-failed-but-there-are-no-error-messages","text":"You need to use the Presto console and search for the SQL statement.
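If you prefer the command line, the system connector described earlier can surface the same information; this is a quick sketch, where FAILED is one of the standard Presto query states and the query_id returned matches the Query ID shown in the console. select query_id, state, query from \"system\".runtime.queries where state = 'FAILED'; Once you have located the statement in the Presto console: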
Click on the Query ID to find more details of the statement execution and scroll to the bottom of the web page to see any error details.","title":"A SQL Statement failed, but there are no error messages"},{"location":"wxd-troubleshooting/#apache-superset-isnt-starting","text":"If Superset doesn't start for some reason, you will need to reset it completely to try it again. First make sure you are connected as the watsonx user, not root . Make sure you have stopped the terminal session that is running Apache Superset. Next, remove the Apache Superset directory. sudo rm -rf /home/watsonx/superset We remove the docker images associated with Apache Superset. If no containers or volumes exist, you will get an error message. docker ps -a -q --filter \"name=superset\" | xargs docker container rm --force docker volume list -q --filter \"name=superset\" | xargs docker volume rm --force Download the superset code again. git clone https://github.com/apache/superset.git The docker-compose-non-dev.yml file needs to be updated so that Apache Superset can access the same network that watsonx.data is using. cd ./superset cp docker-compose-non-dev.yml docker-compose-non-dev-backup.yml sed '/version: \"3.7\"/q' docker-compose-non-dev.yml > yamlfix.txt cat <<EOF >> yamlfix.txt networks: default: external: True name: ibm-lh-network EOF sed -e '1,/version: \"3.7\"/ d' docker-compose-non-dev.yml >> yamlfix.txt We update the Apache Superset code to version 2.1.0 . sed 's/\\${TAG:-latest-dev}/2.1.0/' yamlfix.txt > docker-compose-non-dev.yml Use docker-compose to start Apache Superset. nohup docker compose -f docker-compose-non-dev.yml up & The nohup command will issue a message indicating that output will be directed to the nohup.out file. It takes some time for the service to start, so be patient! You can view any output from the Apache Superset system by viewing the nohup.out file in the directory where you installed superset.","title":"Apache Superset isn't Starting"},{"location":"wxd-troubleshooting/#apache-superset-screens-differ-from-the-lab","text":"The Apache Superset project makes frequent changes to the types of charts that are available. In some cases they remove or merge charts. Since these chart changes are dynamic, we are not able to guarantee that our examples will look the same as what you might have on your system.","title":"Apache Superset screens differ from the lab"},{"location":"wxd-troubleshooting/#presto-doesnt-appear-to-be-working","text":"If you find that the watsonx.data UI is generating error messages that suggest that queries are not running, or that the Presto service is dead, you can force a soft restart of Presto with the following command: docker restart ibm-lh-presto This will restart the Presto server. If that does not fix your problem, you will need to do a hard reset using the following commands: sudo su - cd /root/ibm-lh-dev/bin ./stop_service ibm-lh-presto ./start_service ibm-lh-presto check_presto The command will wait until the service is running before exiting.","title":"Presto doesn't appear to be working"},{"location":"wxd-troubleshooting/#displaying-db2-schema-is-failing","text":"Occasionally when attempting to expand the Db2 catalog (schema), the watsonx.data UI will not display any data or will issue an error message. You can try refreshing the browser (not the refresh icon inside the UI) and try again. If you find that this is failing again, open the Query workspace and run the following SQL (replace db2_gosales with the name you cataloged the database with).
select count(*) from db2_gosales.gosalesdw.go_org_dim The result should be 123 and hopefully the tables that are part of the schema will display for you.","title":"Displaying Db2 Schema is failing"},{"location":"wxd-troubleshooting/#queries-are-failing-with-a-400-code","text":"The watsonx.data UI will log you out after a period of inactivity, but doesn't tell you that this has happened. When you attempt to run a query, the error that is returned (400) indicates that you need to log back in again.","title":"Queries are failing with a 400 code"},{"location":"wxd-troubleshooting/#queries-are-failing-with-a-200-or-500-code","text":"A 500 code may indicate that the watsonx.data UI has a problem connecting with the Presto engine. First log out of the console and try logging back on. If that fails to solve the problem, you will need to restart the console. Open up a terminal window into the server: As the root user, restart the docker container that is running the watsonx.data UI. docker restart lhconsole-nodeclient-svc","title":"Queries are failing with a 200 or 500 code"},{"location":"wxd-troubleshooting/#queries-fail-become-of-insufficient-memory","text":"If you are running a complex query, you may get an error message similar to \"Query exceeded per-node user memory limit\" or something similar. Watsonx.data (Presto) attempts to limit the amount of resources being used by a query and will stop a query if it exceeds a certain threshold. You can change the behavior of the system by making the following changes. Note : During this step you will disconnect anyone running a query on the server. What you need to do is make a change to the configuration settings of the Presto engine. As the root user, enter the docker container for the presto engine: docker exec -it ibm-lh-presto /bin/bash Next, copy the original config file to a safe place in case we make an error: cp /opt/presto/etc/config.properties /opt/presto/etc/config.properties.backup Then update the properties file. cat >> /opt/presto/etc/config.properties << EOL experimental.spiller-spill-path=/tmp experimental.spiller-max-used-space-threshold=0.7 experimental.max-spill-per-node=10GB experimental.query-max-spill-per-node=10GB experimental.spill-enabled=true query.max-memory=10GB query.max-memory-per-node=10GB query.max-total-memory-per-node=10GB query.max-total-memory=10GB EOL Double-check that it worked. cat /opt/presto/etc/config.properties | grep experimental experimental.max-spill-per-node=10GB experimental.query-max-spill-per-node=10GB experimental.spill-enabled=true experimental.spiller-max-used-space-threshold=0.7 experimental.spiller-spill-path=/tmp If it all looks good, then exit the container. exit And now we restart the container. Make sure that you don't impact other users! docker restart ibm-lh-presto Now try running your query again. Note : Once you make this change, only restart Presto using the above command, otherwise you will lose the changes.","title":"Queries fail because of insufficient memory"},{"location":"wxd-troubleshooting/#too-many-incorrect-logins-using-vnc-and-now-im-blocked-from-connecting","text":"If you lock yourself out of VNC because of too many incorrect logins, you can reset the service with the following commands. Connect as the root user, then run the following command and you should be able to log in again.
systemctl restart vncserver@:1 exit","title":"Too many incorrect logins using VNC and now I'm blocked from connecting"},{"location":"wxd-troubleshooting/#ssh-vnc-and-watsonxdata-ui-are-not-working","text":"Symptoms: You've tried to use SSH to log into the system, and you get a timeout error. All the Web-based UIs (watsonx.data UI, Presto) fail. Find your email message that contains details of your reservation. Details of the reservation, and the page that contains them, can be found in the Accessing the reservation section. Once the details appear, scroll down to the bottom of the web page, and you will see the VM Remote Console button. Clicking on this button will display the logon screen for the server. If you see this screen, the system is running and there is something wrong with the watsonx.data service (see instructions below). If you see the following screen: This means your server has been turned off. Click on the Power on button. Make sure to press the Yes button to turn the power on! In a few minutes you should see the logon screen again. Wait for a few minutes for all the services to start, and then you will be able to use SSH, VNC, and the watsonx.data UI.","title":"SSH, VNC and watsonx.data UI are not working"},{"location":"wxd-troubleshooting/#reset-watsonxdata","text":"If you can log into the watsonx userid using the VM Remote Console, you can reset the watsonx.data server with the following steps. SSH into the server as the root user. Then switch to the development code bin directory. cd /root/ibm-lh-dev/bin Check the status of the system with the following command. ./status.sh --all Output will look similar to: using /root/ibm-lh-dev/localstorage/volumes as data root directory for user: root/1001 infra config location is /root/ibm-lh-dev/localstorage/volumes/infra lhconsole-ui running 0.0.0.0:9443->8443/tcp, :::9443->8443/tcp lhconsole-nodeclient-svc running 3001/tcp lhconsole-javaapi-svc running 8090/tcp lhconsole-api running 3333/tcp, 8081/tcp ibm-lh-presto running 0.0.0.0:8443->8443/tcp, :::8443->8443/tcp ibm-lh-hive-metastore running ibm-lh-postgres running 5432/tcp ibm-lh-minio running If any of the services are not running, you will need to restart the system with the following set of commands. cd /root/ibm-lh-dev/bin ./stop.sh export LH_RUN_MODE=diag ./start.sh Wait for all services to start and then check to see if you can connect to the watsonx.data UI.","title":"Reset watsonx.data"},{"location":"wxd-troubleshooting/#no-access-to-prestominio-ui-after-restart","text":"If you are using a TechZone image that has been suspended or restarted, you may come across a situation where you are unable to connect to any service that uses the http protocol. The watsonx.data service needs to have a diagnostic flag set that opens up these ports, and sometimes this diagnostic setting is not updated. To manually stop and start the system, you will need to connect with root user privileges and run the following commands: sudo su - cd /root/ibm-lh-dev/bin ./stop.sh export LH_RUN_MODE=diag ./start.sh This set of commands will stop all the services in watsonx.data and restart them in diagnostic mode. This will now open the http ports for use.
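To confirm that the ports are open, you can probe one of them from a terminal window; curl is used here purely as a connectivity check, and any of the ports listed in the lab would do. curl -k https://localhost:9443 If the console responds, the remaining http services should be reachable as well.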
","title":"No access to Presto/Minio UI after restart"},{"location":"wxd-troubleshooting/#firefox-and-chrome-freeze-when-connecting-to-minio","text":"Firefox and Chrome on OSX will occasionally freeze when connecting to the MinIO console. The Safari browser is much more reliable. This problem appears to be caused by some features which are not properly handled by these browsers.","title":"Firefox and Chrome freeze when connecting to MinIO"},{"location":"wxd-useradmin/","text":"Watsonx.data User Administration and Roles Security and access control within watsonx.data are based on roles. A role is a set of privileges that control the actions that users can perform. Authorization is granted by assigning a specific role to a user, or by adding the user to a group that has been assigned one or more roles. Access control at the infrastructure level allows permissions to be granted on the engines, catalogs, buckets, and databases. Roles for these components include Admin, Manager, User, Writer, and Reader (depending on the component). Access to the data itself is managed through data control policies. Policies can be created to permit or deny access to schemas, tables, and columns. User account management and access management vary between the different deployment options for watsonx.data. For instance, in the managed cloud service (SaaS), the service owner would need to invite other users to the environment and give them appropriate service access. With the standalone software, users can be added within the console\u2019s Access control page. In the Developer Edition, users can be added using a command line tool. Credits: Portions of text found in this section were copied from Kelly Schlamb's watsonx.data L3 course. User Administration This lab is using the Developer edition of the watsonx.data software, which means that the Access control panel is not available. To manage users, you will need to use the user-mgmt command. The user-mgmt command is found in the /root/ibm-lh-dev/bin directory. Examples of using the command are found below. Add a User The syntax for adding a user is: ./user-mgmt add-user <username> [ User | Admin ] <password> The values are: username - The name of the user [User|Admin] - The type of user. Note that the type of user is case-sensitive! password - The password for the user. The following command will add the user watsonx with a password of watsonx.data . This will be a standard user with no privileges. The first step is to make sure you are connected as the root user in the watsonx.data server and have switched to the proper directory. sudo su - cd /root/ibm-lh-dev/bin The next command will add a new user to the system. ./user-mgmt add-user watsonx User watsonx.data Change a User's Password The syntax for changing a password is: ./user-mgmt change-password <username> This command will require that the user enter the new password. You can issue the command and provide the new password at the prompt. The other way to simulate the enter command is to use the Linux yes function, which repeats a value multiple times. The following command will change the password of watsonx to hellowatson . yes hellowatson | ./user-mgmt change-password watsonx Validate a User's Password You can validate a password by using the following command: ./user-mgmt test-user-cred <username> The username is the name of the user that you want to check the password for. This command will require that the user enter the existing password to check it.
You can use the yes function (as described above) to simulate the enter command. The following command will check that we have changed the password of watsonx to hellowatson . yes hellowatson | ./user-mgmt test-user-cred watsonx Delete a User To delete a user, use the following command: ./user-mgmt delete-user <username> The error messages on group ownership can be safely ignored. The following command will remove our watsonx user. ./user-mgmt delete-user watsonx Roles and Policies In this section you will add a new user and provide them with privileges over the infrastructure and data. First start by adding a new user to the watsonx.data system. If you haven't already, make sure you are connected to the server as the root user and are in the /root/ibm-lh-dev/bin directory. Add user1 to the system with a password of password . ./user-mgmt add-user user1 User password Access Control To view which users are currently authorized to use the system, select the Access control icon found on the left side of the watsonx.data UI. A list of objects that make up the watsonx.data infrastructure is displayed. You can see that the objects are made up of: Engines Catalogs Buckets In a real-world scenario where a user will be querying data from a table, that user will need to be given a minimum of User access to an engine (to be able to run the query), User access for the catalog associated with the data (to be able to see the schema information associated with the table), and Reader access to the bucket associated with the data (to be able to read the data from object storage). Additionally, a policy has to be created to permit the user to access the table in question. Granting Access Select the presto-01 engine (highlighted in red above) to view the current users that have access to the engine. At this point, only the administrative user (ibmlhadmin) can use the Presto engine. Click on the Add Access button to add a new authorized user to the list. The role button has been selected in the dialog to show the role options of Admin, Manager, or User. An Admin user can grant any role to a user, while a Manager can only grant User privileges. For this lab, grant user1 \"user\" privileges and then press Add. Close the dialog by clicking on the [x] on the top right of the screen. Now user1 needs to be granted access to a catalog. In this case, the iceberg_data and hive_data catalogs are required for the Presto engine and are implicitly granted to the user. Click on the iceberg_data line to confirm that this is the case. You should see that user1 has already been granted access to the catalog. If your version of watsonx.data Developer Edition does not have access granted to user1 , add it manually using the same steps as for engine access. Close the dialog to return to the Access control screen. The final step is to grant access to the underlying buckets. Because user1 was implicitly granted access to the iceberg_data and hive_data catalogs, the underlying buckets iceberg-bucket and hive-bucket were also added to their access list. Click on iceberg-bucket to view the access control. Again, if user1 does not have access to the bucket, add access using the same steps as adding engine access. When done viewing the screen, close the dialog to return to the Access control dialog. Policies After access has been granted to engines, catalogs, and buckets, the final step is to create a policy to grant access to tables. Click on the Policy tab to display the current policies in place (there should be none).
The Add Policy button is highlighted on the far right side of the screen. Pressing the button will display the new Access Control Policy dialog. Fill in the following information: Policy name: selectflights Description: blank Policy status: active You can always activate a policy after you have created it. Click Next. Here we need to select which schemas the user will be able to access. For this example, select the ontime schema. After selecting the schema, a set of tables associated with the schema will be listed. You can choose which tables can be searched. If you choose an individual table, you can restrict which columns can be searched. Select the ontime table and then select the following columns (you will need to scroll down the page): flightdate reporting_airline flight_number_reporting_airline originairportid destairportid Once you have selected the columns, press the Next button to display the Rules dialog. The Rules dialog allows you to fine-tune what actions can be done by a user against the data. Press the Add Rule button to display the Add Rule dialog. Rules can be used to Allow actions or to Deny actions. In our example, we want to allow user1 to SELECT data from the table, but with no other options. Note : In production versions of watsonx.data, you can grant access to a group, which makes it simpler to manage rules: the rules are applied to the group, and a user added to the group inherits them rather than needing rules created individually. The developer edition displays GROUP as an option, but it is not implemented. Once the fields are filled in, press the Add button. You can continue to add additional rules to the list. Since we only want the one rule, select the Review button. Once you have confirmed that the data objects and rules look correct, press the Save button. The selectflights policy is now in place and is actively enforced. Before testing the policy enforcement, use the SQL icon on the left side to navigate to the hive_data catalog and view the schemas that are associated with it. Expand the ontime schema to view the tables and the columns that are available in the ontime table. When you connect as user1 , you will be able to compare what objects can be accessed from that userid. Testing Policy Enforcement To test whether the rules are enforced, you will need to log out of the current watsonx.data UI session. At the top of the Access Control screen, you will see the user icon on the top right. Clicking on the icon will display the logout dialog. Log out to the main watsonx.data UI screen. You will be prompted to confirm the log out. Once back at the main log in panel, enter user1 and the password password into the dialog. Your watsonx.data UI should now display user1 . Navigate to the SQL icon and then select hive_data -> ontime -> ontime . You should notice that user1 was restricted to seeing only the ontime schema in the hive_data catalog. In addition, the user was restricted to accessing one of the tables ( ontime ) and 5 columns. Attempting to run a SELECT statement against all the data will result in a policy error. Correcting the SQL to include only permitted columns results in an answer set. The policy rules have been enforced for user1 , preventing them from viewing any other schemas or tables in the system. In addition, the SQL that they could execute was restricted to specific columns in the table.
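For example, a statement that names only permitted columns should succeed, while selecting everything will be rejected (the table and column names below are taken from the selectflights policy defined above): select flightdate, reporting_airline, originairportid from hive_data.ontime.ontime limit 5; select * from hive_data.ontime.ontime; The first statement returns rows; the second fails with a policy error because it touches columns outside the policy.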
Before moving onto any other sections, make sure to log out as user1 and reconnect as the ibmlhadmin user.","title":"User Administration and Roles"},{"location":"wxd-useradmin/#watsonxdata-user-administration-and-roles","text":"Security and access control within watsonx.data are based on roles. A role is a set of privileges that control the actions that users can perform. Authorization is granted by assigning a specific role to a user, or by adding the user to a group that has been assigned one or more roles. Access control at the infrastructural level allows permissions to be granted on the engines, catalogs, buckets, and databases. Roles for these components include Admin, Manager, User, Writer, and Reader (depending on the component). Access to the data itself is managed through data control policies. Policies can be created to permit or deny access to schemas, tables, and columns. User account management and access management varies between the different deployment options for watsonx.data. For instance, in the managed cloud service (SaaS), the service owner would need to invite other users to the environment and give them appropriate service access. With the standalone software, users can be added within the console\u2019s Access control page. In the Developer Edition, users can be added using a command line tool. Credits: Portions of text found in this section were copied from Kelly Schlamb's watsonx.data L3 course.","title":"Watsonx.data User Administration and Roles"},{"location":"wxd-useradmin/#user-administration","text":"This lab is using the Developer edition of the watsonx.data software, which means that the Access control panel is not available. In order to manage users, the user-mgmt command will need to be used. The user-mgmt command is found in the /root/ibm-lh-dev/bin directory. Examples of using the command are found below.","title":"User Administration"},{"location":"wxd-useradmin/#add-a-user","text":"The syntax for adding a user is: ./user-mgmt add-user [ User | Admin ] The values are: username - The name of the user [User|Admin] - The type of user. Note that the type of user is case-sensitive! password - The password for the user. The following command will add the user watsonx with a password of watsonx.data . This will be a standard user with no privileges. The first step is to make sure you are connected as the root user in watsonx.data server and have switched to the proper directory. sudo su - cd /root/ibm-lh-dev/bin The next command will add a new user to the system. ./user-mgmt add-user watsonx User watsonx.data","title":"Add a User"},{"location":"wxd-useradmin/#change-a-users-password","text":"The syntax for changing a password is: ./user-mgmt change-password This command will require that the user enter the new password. You can issue the command and provide the new password at the prompt. The other way to simulate the enter command is to use the Linux yes function which repeats a value multiple times. The following command will change the password of watsonx to hellowatson . yes hellowatson | ./user-mgmt change-password watsonx","title":"Change a User's Password"},{"location":"wxd-useradmin/#validate-a-users-password","text":"You can validate a password by using the following command: ./user-mgmt test-user-cred The username is the name of the user that you want to check the password for. This command will require that the user enter the existing password to check it. You can use the yes function (as described above) to simulate the enter command. 
The following command will check that we have changed the password of watsonx to hellowatson . yes hellowatson | ./user-mgmt test-user-cred watsonx","title":"Validate a User's Password"},{"location":"wxd-useradmin/#delete-a-user","text":"To delete a user, use the following command: ./user-mgmt delete-user The error messages on group ownership can be safely ignored. The following command will remove our watsonx user. ./user-mgmt delete-user watsonx","title":"Delete a User"},{"location":"wxd-useradmin/#roles-and-policies","text":"In this section you will add a new user and provide them with privileges over the infrastructure and data. First start by adding a new user to the watsonx.data system. If you haven't already, make sure you are connected to the server as the root user and are in the /root/ibm-lh-dev/bin directory. Add user1 to the system with a password of password . ./user-mgmt add-user user1 User password","title":"Roles and Policies"},{"location":"wxd-useradmin/#access-control","text":"To view what users are currently authorized to use the system, select the Access control icon found on the left side of the watsonx.data UI. A list of objects that make up the watsonx.data infrastructure are displayed. You can see that the objects are made up of: Engines Catalogs Buckets In a real-world scenario where a user will be querying data from a table, that user will need to be given a minimum of User access to an engine (to be able to run the query), User access for the catalog associated with the data (to be able to see the schema information associated with the table), and Reader access to the bucket associated with the data (to be able to read the data from object storage). Additionally, a policy has to be created to permit the user to access the table in question.","title":"Access Control"},{"location":"wxd-useradmin/#granting-access","text":"Select the presto-01 engine (highlighted in red above) to view the current users that have access to the engine. At this point, only the administrative user (ibmlhadmin) can use the Presto engine. Click on the Add Access button to add a new authorized user to the list. The role button has been selected in the dialog to show the role options of Admin, Manager, or User. An Admin user can grant any role to a user, while a Manager can only grant User privileges. For this lab, grant user1 \"user\" privileges and then press Add. Close the dialog by clicking on the [x] on the top right of the screen. Now user1 needs to be granted access to a catalog. In this case, the iceberg_data and hive_data catalogs are required for the Presto engine and are implicitly granted to the user. Click on the iceberg_data line to confirm that this is the case. You should see that user1 has already been granted access to the catalog. If your version of watsonx.data Developer Edition does not have access granted to user1 , add it manually using the same steps as for engine access. Close the dialog to return to the Access control screen. The final step is to grant access to the underlying buckets. Because user was implicitly granted access to the iceberg_data and hive_data catalogs, the underlying buckets iceberg-bucket and hive-bucket were also added to their access list. Click on iceberg-bucket to view the access control. Again, if user1 does not have access to the bucket, add access using the same steps as adding engine access. 
When done viewing the screen, close the dialog to return to the Access control dialog.","title":"Granting Access"},{"location":"wxd-useradmin/#policies","text":"After access has been granted to engines, catalogs, and buckets, the final step is to create a policy to grant access to tables. Click on the Policy tab to display the current policies in place (there should be none). The Add Policy button is highlighted on the far right side of the screen. Pressing the button will display the new Access Control Policy dialog. Fill in the following information: Policy name: selectflights Description: blank Policy status: active You can always activate a policy after you have created it. Click Next. Here we need to select which schemas that the user will be able to access. For this example, select the ontime schema. After selecting the schema, a set of tables associated with the schema will be listed. You can choose which tables can be searched. If you choose an individual table, you can restrict which columns can be searched. Select the ontime table and then select the following columns (you will need to scroll down the page): flightdate reporting_airline flight_number_reporting_airline originairportid destairportid Once you have selected the columns, press the Next button to display the Rules dialog. The Rules dialog allows you to fine-tune what actions can be done by a user against the data. Press the Add Rule button to display the Add Rule dialog. Rules can be used to Allow actions or to Deny actions. In our example, we want to allow user1 to SELECT data from the data, but with no other options. Note : In production versions of watsonx.data, you can provide access to a group which makes it simpler to create a set of rules that apply to a group and then add a user to a group. That way a user will inherit the rules that were applied to the group rather than having to create rules for that individual. The developer edition displays GROUP as an option, but it is not implemented. Once the fields are filled in, press the Add button. You can continue to add additional rules to the list. Since we only want the one rule, select the Review button. Once you have confirmed that the data objects and rules look correct, press the Save button. The selectflights policy is now in place and is actively enforced. Before testing the policy enforcement, use the SQL icon on the left side to navigate to the hive_data catalog and view the schemas that are associated with it. Expand the ontime schema to view the tables and the columns that are available in the ontime table. When you connect as user1 , you will be able to compare what objects can be accessed from that userid.","title":"Policies"},{"location":"wxd-useradmin/#testing-policy-enforcement","text":"To test whether the rules are enforced, you will need to log out of the current watsonx.data UI session. At the top of the Access Control screen, you will see the user icon on the top right. Clicking on the icon will display the logout dialog. Log out to the main watsonx.data UI screen. You will be prompted to confirm log out. Once back to the main log in panel, enter user and password into the dialog. Your watsonx.data UI should now display user1 . Navigate to the SQL icon and then select hive_data -> ontime -> ontime . You should notice that user1 was restricted to seeing only the ontime schema in the hive_data catalog. In addition, the user was restricted to accessing one of the tables ( ontime ) and 5 columns. 
Attempting to run a SELECT statement against all the data will result in a policy error. Correcting the SQL to include only permitted columns results in an answer set. The policy rules have been enforced for user1 , preventing them from viewing any other schemas or tables in the system. In addition, the SQL that they could execute was restricted to specific columns in the table. Before moving on to any other sections, make sure to log out as user1 and reconnect as the ibmlhadmin user.","title":"Testing Policy Enforcement"},{"location":"wxd-vmware/","text":"IBM watsonx.data VMware Image The IBM watsonx.data lab can be run in a virtual machine environment using VMWare Workstation, VMWare Fusion, or Oracle VirtualBox. The location of the OVA file (a compressed OS image format) is provided in the TechZone page for the lab: https://techzone.ibm.com/collection/ibm-watsonxdata-developer-base-image Select the resources tab to get details on how to download the file. Download the watsonxdata.ova file onto your local machine and then use the import function of VMware or VirtualBox to register it with the system. Note : This virtual machine was created using X64 (Intel) hardware, so it will not work in an OSX environment using M1/M2 chips. Once the machine is imported, you can delete the OVA file. Before starting the machine, you may want to adjust the hardware requirements. vCPUs \u2013 4 vCPUs minimum Memory \u2013 16Gb minimum (you can try 12Gb, but it will be tight!) Disk \u2013 30Gb initial size, but the image will grow in size \"Disable side channel mitigation\" set to ON (VMware only) VMware URLs All the URLs in the lab use 192.168.252.2 as the host. When running in the VMware image, you must use localhost for the addresses. You must substitute localhost for the 192.168.252.2 address when you come across it in the documentation. The following URLs and Ports are used to access the watsonx.data services. The ports that are used in the lab are listed below. https://localhost:9443 - watsonx.data management console http://localhost:8080 - Presto console http://localhost:9001 - MinIO console (S3 buckets) https://localhost:6443 - Portainer (Docker container management) http://localhost:8088 - Apache Superset (Query and Graphing) 8443 - Presto External Port 5432 - Postgres External Port 50000 - Db2 Database Port The Apache Superset link will not be active until started as part of the lab. These links have been placed into the Firefox browser for your convenience. Starting the VMware Image When the machine starts, you will be prompted with the logon screen. There are two userids that we will be using in the VMware image: root \u2013 password watsonx.data watsonx \u2013 password watsonx.data When successfully logged in, you should see the following screen. Next, check that your network connection is up and running. You will be able to see if the network is connected when the network icon appears on the top row. If it shows Wired Off, make sure to turn it on by clicking on the arrow and choosing \"Connect\". If you are using something other than an English keyboard, click on the en1 symbol on the top bar to switch to a different layout. If your keyboard is not listed, you will need to go into Settings and add your keyboard layout. You may also want to consider making the screen size larger. Use the drop-down menu at the top of the screen to select System Tools -> Settings. In the Devices section of the Settings menu, select Displays and choose a resolution that is suitable for your environment.
Using External Ports with VMware/Virtual Box The labs assume that you are using a browser \"within\" your virtual machine console. However, both VMware and VirtualBox provide a method for accessing the ports on the virtual machine in your local environment. VMware For VMware, the easiest way to connect to the virtual machine from your host machine is to use the ifconfig command to determine your virtual machine IP address. ifconfig Search for an ensxx** value in the output from the command. There you should see the inet address of your virtual machine ( 172.16.210.237 ). To access the Portainer application from your local browser, you would use this address followed by the Portainer PORT number: https://172.16.210.237:6443 . Remember that inside your virtual machine, you will be using https://localhost:6443 . The following PORT numbers are open in the machine: 9443 - IBM watsonx.data management console 8080 - Presto console 9001 - MinIO console (S3 buckets) 6443 - Portainer (Docker container management) 8088 - Apache Superset (Query and Graphing) 5901 - VNC Access (Access to GUI in the machine) 7681 - SSH (Terminal access) via Browser 22 - SSH (Terminal access) via local terminal program 8443 - Presto External Port (dBeaver connection) 5432 - Postgres External Port (dBeaver connection) VirtualBox VirtualBox does not externalize the IP address of the virtual machine. The ifconfig command will provide an IP address of the machine, but it will not be reachable from your host browser. To open the ports, you must use the network option on the virtual machine. This step can be done while the machine is running. From the VirtualBox console, choose Settings for the machine and then click on the Network option. Press the Advanced option near the bottom of the dialog. Select the Port Forwarding button. This will display the port forwarding menu. You must place an entry for each port that we want to externalize to the host machine. If the value for Host IP is empty (blank), it defaults to localhost. In the example above, the 5901 port in the Guest machine (watsonxdata) is mapped to the host machines 5901 port. To access VNC, you would use localhost:5901 . If the guest machine port conflicts with the host machine port number, you can use a different port number. Terminal Command Window All the commands in the lab will require you execute commands in a terminal window. In addition, the labs require access to the root userid, and this can be accomplished in two ways that are described below. Local Terminal Shell Use a local terminal shell (iterm, Hyper, terminal) and use the SSH command to shell into the machine. For the VMware image, you need to know the IP address of the image and the port number that has been exposed for SSH command (default is 22). Assuming that your VMware machine has an IP address of 172.16.210.237 , the command to SSH into the machine would be: ssh watsonx@172.16.210.237 You will need to accept the unknown host warning and then provide the password for the watsonx userid: watsonx.data . At this point you are connected as the watsonx user. To become the root user, you must enter the following command in the terminal window. sudo su - Now as the root user you will be ready to run the commands found in the lab. Terminal Window in Virtual Machine You can use the Terminal application in the virtual machine to issue commands. This will open up the terminal window. At this point you are connected as the watsonx user. 
You can ignore any lab instructions that ask you to ssh into the watsonx server. To become the root user, you must enter the following command in the terminal window. sudo su - Now as the root user you will be ready to run the commands found in the lab.","title":"VMWare Image"},{"location":"wxd-vmware/#ibm-watsonxdata-vmware-image","text":"The IBM watsonx.data lab can be run in a virtual machine environment using VMWare Workstation, VMWare Fusion, or Oracle VirtualBox. The location of the OVA file (a compressed OS image format) is provided in the TechZone page for the lab: https://techzone.ibm.com/collection/ibm-watsonxdata-developer-base-image Select the resources tab to get details on how to download the file. Download the watsonxdata.ova file onto your local machine and then use the import function of VMware or VirtualBox to register it with the system. Note : This virtual machine was created using X64 (Intel) hardware, so this will not work in an OSX environment using M1/M2 chips. Once the machine is imported you can delete the OVA file. Before starting the machine, you may want to adjust the hardware requirements. vCPUs \u2013 4 VPCs minimum Memory \u2013 16Gb minimum (You can try 12Gb but tight!) Disk \u2013 30Gb initial size, but the image will grow in size Disable side channel mitigation ON (VMware only)","title":"IBM watsonx.data VMware Image"},{"location":"wxd-vmware/#vmware-urls","text":"All the URLs in the lab use 192.168.252.2 as the host. When running in the VMware image, you must use localhost for the addresses. You must substitute localhost for the 192.168.252.2 address when you come across it in the documentation. The following URLs and Ports are used to access the watsonx.data services. The ports that are used in the lab are listed below. https://localhost:9443 - watsonx.data management console http://localhost:8080 - Presto console http://localhost:9001 - MinIO console (S3 buckets) https://localhost:6443 - Portainer (Docker container management) http://localhost:8088 - Apache Superset (Query and Graphing) 8443 - Presto External Port 5432 - Postgres External Port 50000 - Db2 Database Port The Apache Superset link will not be active until started as part of the lab. These links have been placed into the Firefox browser for your convenience.","title":"VMware URLs"},{"location":"wxd-vmware/#starting-the-vmware-image","text":"When the machine starts, you will be prompted with the logon screen. There are two userids that we will be using in the VMware image: root \u2013 password watsonx.data watsonx \u2013 password watsonx.data When successfully logged in you should see the following screen. Next, check that your network connection is up and running. You will be able to see if the network is connected when the network icon appears on the top row. If it shows Wired Off, make sure to turn it on by clicking on the arrow and choosing \"Connect\". If you are using something other than an English keyboard, click on the en1 symbol on the top bar to switch to a different layout. If your keyboard is not listed, you will need to go into Settings and add your keyboard layout. You may also want to consider making the screen size larger. Use the drop-down menu at the top of the screen to select System Tools -> Settings. 
In the Devices section of the Setting menu, select Displays and choose a resolution that is suitable for your environment.","title":"Starting the VMware Image"},{"location":"wxd-vmware/#using-external-ports-with-vmwarevirtual-box","text":"The labs assume that you are using a browser \"within\" your virtual machine console. However, both VMware and VirtualBox provide a method for accessing the ports on the virtual machine in your local environment.","title":"Using External Ports with VMware/Virtual Box"},{"location":"wxd-vmware/#vmware","text":"For VMware, the easiest way to connect to the virtual machine from your host machine is to use the ifconfig command to determine your virtual machine IP address. ifconfig Search for an ensxx** value in the output from the command. There you should see the inet address of your virtual machine ( 172.16.210.237 ). To access the Portainer application from your local browser, you would use this address followed by the Portainer PORT number: https://172.16.210.237:6443 . Remember that inside your virtual machine, you will be using https://localhost:6443 . The following PORT numbers are open in the machine: 9443 - IBM watsonx.data management console 8080 - Presto console 9001 - MinIO console (S3 buckets) 6443 - Portainer (Docker container management) 8088 - Apache Superset (Query and Graphing) 5901 - VNC Access (Access to GUI in the machine) 7681 - SSH (Terminal access) via Browser 22 - SSH (Terminal access) via local terminal program 8443 - Presto External Port (dBeaver connection) 5432 - Postgres External Port (dBeaver connection)","title":"VMware"},{"location":"wxd-vmware/#virtualbox","text":"VirtualBox does not externalize the IP address of the virtual machine. The ifconfig command will provide an IP address of the machine, but it will not be reachable from your host browser. To open the ports, you must use the network option on the virtual machine. This step can be done while the machine is running. From the VirtualBox console, choose Settings for the machine and then click on the Network option. Press the Advanced option near the bottom of the dialog. Select the Port Forwarding button. This will display the port forwarding menu. You must place an entry for each port that we want to externalize to the host machine. If the value for Host IP is empty (blank), it defaults to localhost. In the example above, the 5901 port in the Guest machine (watsonxdata) is mapped to the host machines 5901 port. To access VNC, you would use localhost:5901 . If the guest machine port conflicts with the host machine port number, you can use a different port number.","title":"VirtualBox"},{"location":"wxd-vmware/#terminal-command-window","text":"All the commands in the lab will require you execute commands in a terminal window. In addition, the labs require access to the root userid, and this can be accomplished in two ways that are described below.","title":"Terminal Command Window"},{"location":"wxd-vmware/#local-terminal-shell","text":"Use a local terminal shell (iterm, Hyper, terminal) and use the SSH command to shell into the machine. For the VMware image, you need to know the IP address of the image and the port number that has been exposed for SSH command (default is 22). Assuming that your VMware machine has an IP address of 172.16.210.237 , the command to SSH into the machine would be: ssh watsonx@172.16.210.237 You will need to accept the unknown host warning and then provide the password for the watsonx userid: watsonx.data . 
At this point you are connected as the watsonx user. To become the root user, you must enter the following command in the terminal window. sudo su - Now as the root user you will be ready to run the commands found in the lab.","title":"Local Terminal Shell"},{"location":"wxd-vmware/#terminal-window-in-virtual-machine","text":"You can use the Terminal application in the virtual machine to issue commands. Opening the application will display a terminal window. At this point you are connected as the watsonx user. You can ignore any lab instructions that ask you to ssh into the watsonx server. To become the root user, you must enter the following command in the terminal window. sudo su - Now as the root user you will be ready to run the commands found in the lab.","title":"Terminal Window in Virtual Machine"},{"location":"wxd-watsonui/","text":"Using the watsonx.data console UI Your TechZone reservation will include the server name and port number to use when connecting to the watsonx.data UI. The default port number is 9443, while the server will be referred to as region.techzone-server.com . Replace these values with those found in your reservation. Open your browser and navigate to: Watsonx.data UI - https://region.techzone-server.com:port Credentials: username: ibmlhadmin password: password Note: You will get a Certificate error in Firefox: Select Advanced. Choose \"Accept the Risk and Continue\". If you are using Google Chrome, you can bypass the error message by typing in \"thisisunsafe\" or clicking on the \"Proceed to server name (unsafe)\" link. The server name value will be replaced with the name of the TechZone server you are connecting to. The watsonx.data UI will display. The userid is ibmlhadmin and the password is password . Note : If you see the following screen when first connecting to the UI, this is an indication that the service has not completely initialized. Dismiss all the error messages, then click on the Person icon (far right side above the messages) and select Logout. Close the browser window after logging out and open the web page again until you get the proper login screen. At this point you will be connected to the console. Watsonx.data UI Navigation The main screen provides a snapshot of the objects that are currently found in the watsonx.data system. The infrastructure components area shows 1 engine, 2 catalogs, and 2 buckets associated with the system. You can examine these objects by using the menu system found at the left side of the screen. Click on the hamburger icon. This will provide a list of items that you can explore in the UI. You can also access this list by clicking on one of the following icons. You can explore the various menus to see how the UI works. A brief description of the items is found below. Infrastructure manager - Displays the current engines, buckets and databases associated with the installation. Data Manager - Used to explore the various data sources that are catalogued in the system. You can explore the schemas, tables, table layout and view a subset of the data with this option. The display may take a few minutes to show the schemas in the system as it is querying the catalog and populating the descriptions on the screen. Query Workplace - A SQL-based query tool for accessing the data. Query History - A list of SQL queries that were previously run across all engines. Access Control - Control who can access the data.
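As a first sanity check once you are logged in, you can try a simple query from the Query Workplace against the built-in TPCH sample catalog (a sketch of our own; the tpch catalog and its tiny schema are used elsewhere in this lab):

select name, mktsegment from tpch.tiny.customer limit 5;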
Try using the Data Explorer and Query engine to access some of the data in the pre-defined TPCH schema.","title":"watsonx.data UI"},{"location":"wxd-watsonui/#using-the-watsonxdata-console-ui","text":"Your TechZone reservation will include the server name and port number to use when connecting to the watsonx.data UI. The default port number is 9443, while the server will be referred to as region.techzone-server.com . Replace these values with those found in your reservation. Open your browser and navigate to: Watsonx.data UI - https://region.techzone-server.com:port Credentials: username: ibmlhadmin password: password Note: You will get a Certificate error in Firefox: Select Advanced. Choose \"Accept the Risk and Continue\". If you are using Google Chrome, you can bypass the error message by typing in \"thisisunsafe\" or clicking on the \"Proceed to server name (unsafe)\" link. The server name value will be replaced with the name of the TechZone server you are connecting to. The watsonx.data UI will display. The userid is ibmlhadmin and the password is password . Note : If you see the following screen when first connecting to the UI, this is an indication that the service has not completely initialized. Dismiss all the error messages, then click on the Person icon (far right side above the messages) and select Logout. Close the browser window after logging out and open the web page again until you get the proper login screen. At this point you will be connected to the console.","title":"Using the watsonx.data console UI"},{"location":"wxd-watsonui/#watsonxdata-ui-navigation","text":"The main screen provides a snapshot of the objects that are currently found in the watsonx.data system. The infrastructure components area shows 1 engine, 2 catalogs, and 2 buckets associated with the system. You can examine these objects by using the menu system found at the left side of the screen. Click on the hamburger icon. This will provide a list of items that you can explore in the UI. You can also access this list by clicking on one of the following icons. You can explore the various menus to see how the UI works. A brief description of the items is found below. Infrastructure manager - Displays the current engines, buckets and databases associated with the installation. Data Manager - Used to explore the various data sources that are catalogued in the system. You can explore the schemas, tables, table layout and view a subset of the data with this option. The display may take a few minutes to show the schemas in the system as it is querying the catalog and populating the descriptions on the screen. Query Workplace - A SQL-based query tool for accessing the data. Query History - A list of SQL queries that were previously run across all engines. Access Control - Control who can access the data. Try using the Data Explorer and Query engine to access some of the data in the pre-defined TPCH schema.","title":"Watsonx.data UI Navigation"}]} \ No newline at end of file +{"config":{"indexing":"full","lang":["en"],"min_search_length":3,"prebuild_index":false,"separator":"[\\s\\-]+"},"docs":[{"location":"","text":"Introducing watsonx.data The next-gen watsonx.data lakehouse is designed to overcome the costs and complexities enterprises face. This will be the world\u2019s first and only open data store with multi-engine support that is built for hybrid deployment across your entire ecosystem.
Watsonx.data is the only lakehouse with multiple query engines, allowing you to optimize costs and performance by pairing the right workload with the right engine. Run all workloads from a single pane of glass, eliminating trade-offs in convenience while still improving cost and performance. Deploy anywhere with full support for hybrid cloud and multicloud environments. Shared metadata across multiple engines eliminates the need to re-catalog, accelerating time to value while ensuring governance and eliminating costly implementation efforts. This lab uses the watsonx.data developer package. The Developer package is meant to be used on single nodes. While it uses the same code base, there are some restrictions, especially on scale. In this lab, we will also open some additional ports to understand how everything works. We will also use additional utilities to illustrate connectivity and what makes the watsonx.data system \"open\". We organized this lab into a number of sections that cover many of the highlights and key features of watsonx.data. Access a TechZone or VMWare image for testing Checking watsonx.data status Introduction to watsonx.data components Analytical SQL Advanced SQL functions Time Travel and Federation Working with Object Store Buckets In addition, there is an Appendix which includes common errors and potential fixes or workarounds. Watsonx.data Developer Image The watsonx.data system is running on a virtual machine with the following resources: 4 vCPUs 16Gb of memory 400Gb of disk This is sufficient for running the exercises found in this lab, but it should not be used for performance testing or dealing with large data sets. Watsonx.data Level 3 Technical Training This system is used as a basis for the watsonx.data Level 3 Technical Training. For the detailed lab material, please refer to the following PDF found in Seismic: https://ibm.seismic.com/Link/Content/DCG37pjmPj7VmGCHj2Df8fHVmDJj","title":"Introducing watsonx.data"},{"location":"#introducing-watsonxdata","text":"The next-gen watsonx.data lakehouse is designed to overcome the costs and complexities enterprises face. This will be the world\u2019s first and only open data store with multi-engine support that is built for hybrid deployment across your entire ecosystem. Watsonx.data is the only lakehouse with multiple query engines, allowing you to optimize costs and performance by pairing the right workload with the right engine. Run all workloads from a single pane of glass, eliminating trade-offs in convenience while still improving cost and performance. Deploy anywhere with full support for hybrid cloud and multicloud environments. Shared metadata across multiple engines eliminates the need to re-catalog, accelerating time to value while ensuring governance and eliminating costly implementation efforts. This lab uses the watsonx.data developer package. The Developer package is meant to be used on single nodes. While it uses the same code base, there are some restrictions, especially on scale. In this lab, we will also open some additional ports to understand how everything works. We will also use additional utilities to illustrate connectivity and what makes the watsonx.data system \"open\". We organized this lab into a number of sections that cover many of the highlights and key features of watsonx.data.
Access a TechZone or VMWare image for testing Checking watsonx.data status Introduction to watsonx.data components Analytical SQL Advanced SQL functions Time Travel and Federation Working with Object Store Buckets In addition, there is an Appendix which includes common errors and potential fixes or workarounds.","title":"Introducing watsonx.data"},{"location":"#watsonxdata-developer-image","text":"The watsonx.data system is running on a virtual machine with the following resources: 4 vCPUs 16Gb of memory 400Gb of disk This is sufficient for running the exercises found in this lab, but it should not be used for performance testing or dealing with large data sets.","title":"Watsonx.data Developer Image"},{"location":"#watsonxdata-level-3-technical-training","text":"This system is used as a basis for the watsonx.data Level 3 Technical Training. For the detailed lab material, please refer to the following PDF found in Seismic: https://ibm.seismic.com/Link/Content/DCG37pjmPj7VmGCHj2Df8fHVmDJj","title":"Watsonx.data Level 3 Technical Training"},{"location":"wxd-acknowledgements/","text":"Acknowledgments We would like to thank the entire development team for helping to deliver this release given the tremendous deadlines and constraints that they have been under. The initial lab was created by Deepak Rangarao with contributions from development. Additional material was supplied by Daniel Hancock, with feedback from the members of the watsonx.data activation community. Formatting and script development was done by George Baklarz. The contents of this eBook are the result of a lot of research and testing based on the contents of watsonx.data. Results are based on a specific version of watsonx.data, so you may see different results if using an older or newer version of the development kit. Support For any questions regarding the lab, including any suggestions, general comments, or bug reports, please contact: George Baklarz baklarz@ca.ibm.com Daniel Hancock daniel.hancock@us.ibm.com We would also appreciate any feedback on the successful use of the lab. Thanks for using watsonx.data! Dan, Deepak & George","title":"Acknowledgements"},{"location":"wxd-acknowledgements/#acknowledgments","text":"We would like to thank the entire development team for helping to deliver this release given the tremendous deadlines and constraints that they have been under. The initial lab was created by Deepak Rangarao with contributions from development. Additional material was supplied by Daniel Hancock, with feedback from the members of the watsonx.data activation community. Formatting and script development was done by George Baklarz. The contents of this eBook are the result of a lot of research and testing based on the contents of watsonx.data. Results are based on a specific version of watsonx.data, so you may see different results if using an older or newer version of the development kit.","title":"Acknowledgments"},{"location":"wxd-acknowledgements/#support","text":"For any questions regarding the lab, including any suggestions, general comments, or bug reports, please contact: George Baklarz baklarz@ca.ibm.com Daniel Hancock daniel.hancock@us.ibm.com We would also appreciate any feedback on the successful use of the lab. Thanks for using watsonx.data!
Dan, Deepak & George","title":"Support"},{"location":"wxd-advanced/","text":"Advanced Functions Watsonx.data supports several types of functions including: Mathematical functions Conversion functions String functions Regular expression functions Window functions URL functions Geospatial functions For a complete list see - https://prestodb.io/docs/current/functions.html . We will look at using a few simple examples as part of this lab. Switch to the bin directory. cd /root/ibm-lh-dev/bin Connect to the Workshop Schema. ./presto-cli --catalog iceberg_data --schema workshop Concatenation of one or more string/varchar values Note: We are using a combination of the concat string function and the cast conversion function as part of this query. select concat(cast(custkey as varchar),'--',name) from customer limit 2; _col0 ------------------------- 376--Customer#000000376 377--Customer#000000377 (2 rows) Date functions Date functions can be used as part of the projected columns or in the predicate/where clause. Select orders from the last 2 days. select orderdate from orders where orderdate > date '1998-08-02' - interval '2' day; orderdate ------------ 1998-08-02 1998-08-02 1998-08-01 1998-08-01 1998-08-02 1998-08-01 1998-08-01 1998-08-01 1998-08-02 1998-08-02 1998-08-02 1998-08-02 (12 rows) Number of orders by year. select distinct year(orderdate), count(orderkey) from orders group by year(orderdate); _col0 | _col1 -------+------- 1993 | 2307 1994 | 2303 1998 | 1346 1996 | 2297 1995 | 2204 1992 | 2256 1997 | 2287 (7 rows) Geospatial functions There are 3 basic geometries, then some complex geometries. The basic geometries include: Points Lines Polygons Points You could use https://www.latlong.net to get the longitude/latitude given any address. select ST_Point(-121.748360,37.195840) as SVL, ST_Point(-122.378952, 37.621311) as SFO; SVL | SFO -----------------------------+------------------------------- POINT (-121.74836 37.19584) | POINT (-122.378952 37.621311) (1 row) Lines You could use https://www.latlong.net to get the longitude/latitude for 2 points and then create a straight line from it. Below is just a small stretch of the road leading to IBM SVL campus. select ST_LineFromText('LINESTRING (-121.74294303079807 37.19665657093434, -121.73659072815602 37.20102399761407)'); _col0 ------------------------------------------------------------------------------------------- LINESTRING (-121.74294303079807 37.19665657093434, -121.73659072815602 37.20102399761407) (1 row) Polygons You could use https://geojson.io/#map=16.39/37.196336/-121.746303 to click around and generate the coordinates for a polygon of any shape. The following is a polygon of the IBM Silicon Valley campus. 
select ST_Polygon('POLYGON ( (-121.74418635253568 37.196001834113844, -121.74499684288966 37.19668005184322, -121.74584008032835 37.19707784979194, -121.74629035274705 37.197645197338105, -121.74672425162339 37.198186455965086, -121.74705172247337 37.19828427337538, -121.74760023614738 37.19827775221884, -121.74848440744239 37.19836252721197, -121.74932764488139 37.19789300297414, -121.75039192514376 37.19746260319114, -121.75130884352407 37.19721479614175, -121.75195559845278 37.1963670290329, -121.75198015876644 37.19555185937345, -121.7508585711051 37.19458016564036, -121.74940132582242 37.19447582194559, -121.74841891327239 37.1942866986312, -121.7474446874937 37.193556286900346, -121.74418635253568 37.196001834113844))'); Truncated output ------------------------------------------------------------------------------------------------------------------------------------------------------> POLYGON ((-121.74418635253568 37.196001834113844, -121.74499684288966 37.19668005184322, -121.74584008032835 37.19707784979194, -121.74629035274705 3> (1 row) So now that we have 3 basic geometries Point, Line and Polygon we can perform different operations on spatial data including: Distance between 2 points Point in polygon Intersection of line and polygon \u2003 Distance between SFO airport and IBM SVL We can now use geospatial functions in a nested way to find the distance between 2 points. select ST_Distance(to_spherical_geography(ST_Point(-122.378952, 37.621311)), to_spherical_geography(ST_Point(-121.748360,37.195840)))*0.000621371 as distance_in_miles; distance_in_miles -------------------- 45.408431373195654 (1 row) Exit Presto. quit;","title":"Advanced Functions"},{"location":"wxd-advanced/#advanced-functions","text":"Watsonx.data supports several types of functions including: Mathematical functions Conversion functions String functions Regular expression functions Window functions URL functions Geospatial functions For a complete list see - https://prestodb.io/docs/current/functions.html . We will look at using a few simple examples as part of this lab. Switch to the bin directory. cd /root/ibm-lh-dev/bin Connect to the Workshop Schema. ./presto-cli --catalog iceberg_data --schema workshop","title":"Advanced Functions"},{"location":"wxd-advanced/#concatenation-of-one-or-more-stringvarchar-values","text":"Note: We are using a combination of the concat string function and the cast conversion function as part of this query. select concat(cast(custkey as varchar),'--',name) from customer limit 2; _col0 ------------------------- 376--Customer#000000376 377--Customer#000000377 (2 rows)","title":"Concatenation of one or more string/varchar values"},{"location":"wxd-advanced/#date-functions","text":"Date functions can be used as part of the projected columns or in the predicate/where clause. Select orders from the last 2 days. select orderdate from orders where orderdate > date '1998-08-02' - interval '2' day; orderdate ------------ 1998-08-02 1998-08-02 1998-08-01 1998-08-01 1998-08-02 1998-08-01 1998-08-01 1998-08-01 1998-08-02 1998-08-02 1998-08-02 1998-08-02 (12 rows) Number of orders by year. select distinct year(orderdate), count(orderkey) from orders group by year(orderdate); _col0 | _col1 -------+------- 1993 | 2307 1994 | 2303 1998 | 1346 1996 | 2297 1995 | 2204 1992 | 2256 1997 | 2287 (7 rows)","title":"Date functions"},{"location":"wxd-advanced/#geospatial-functions","text":"There are 3 basic geometries, then some complex geometries. 
The basic geometries include: Points Lines Polygons","title":"Geospatial functions"},{"location":"wxd-advanced/#points","text":"You could use https://www.latlong.net to get the longitude/latitude given any address. select ST_Point(-121.748360,37.195840) as SVL, ST_Point(-122.378952, 37.621311) as SFO; SVL | SFO -----------------------------+------------------------------- POINT (-121.74836 37.19584) | POINT (-122.378952 37.621311) (1 row)","title":"Points"},{"location":"wxd-advanced/#lines","text":"You could use https://www.latlong.net to get the longitude/latitude for 2 points and then create a straight line from it. Below is just a small stretch of the road leading to IBM SVL campus. select ST_LineFromText('LINESTRING (-121.74294303079807 37.19665657093434, -121.73659072815602 37.20102399761407)'); _col0 ------------------------------------------------------------------------------------------- LINESTRING (-121.74294303079807 37.19665657093434, -121.73659072815602 37.20102399761407) (1 row)","title":"Lines"},{"location":"wxd-advanced/#polygons","text":"You could use https://geojson.io/#map=16.39/37.196336/-121.746303 to click around and generate the coordinates for a polygon of any shape. The following is a polygon of the IBM Silicon Valley campus. select ST_Polygon('POLYGON ( (-121.74418635253568 37.196001834113844, -121.74499684288966 37.19668005184322, -121.74584008032835 37.19707784979194, -121.74629035274705 37.197645197338105, -121.74672425162339 37.198186455965086, -121.74705172247337 37.19828427337538, -121.74760023614738 37.19827775221884, -121.74848440744239 37.19836252721197, -121.74932764488139 37.19789300297414, -121.75039192514376 37.19746260319114, -121.75130884352407 37.19721479614175, -121.75195559845278 37.1963670290329, -121.75198015876644 37.19555185937345, -121.7508585711051 37.19458016564036, -121.74940132582242 37.19447582194559, -121.74841891327239 37.1942866986312, -121.7474446874937 37.193556286900346, -121.74418635253568 37.196001834113844))'); Truncated output ------------------------------------------------------------------------------------------------------------------------------------------------------> POLYGON ((-121.74418635253568 37.196001834113844, -121.74499684288966 37.19668005184322, -121.74584008032835 37.19707784979194, -121.74629035274705 3> (1 row) So now that we have 3 basic geometries Point, Line and Polygon we can perform different operations on spatial data including: Distance between 2 points Point in polygon Intersection of line and polygon \u2003 Distance between SFO airport and IBM SVL We can now use geospatial functions in a nested way to find the distance between 2 points. select ST_Distance(to_spherical_geography(ST_Point(-122.378952, 37.621311)), to_spherical_geography(ST_Point(-121.748360,37.195840)))*0.000621371 as distance_in_miles; distance_in_miles -------------------- 45.408431373195654 (1 row) Exit Presto. quit;","title":"Polygons"},{"location":"wxd-analytics/","text":"Analytic Workloads Watsonx.data is based on open source PrestoDB, a distributed query engine that enables querying data stored in open file formats using open table formats for optimization and performance. Some of the characteristics which you will learn and see in action include: Compute processing is performed in memory and in parallel. Data is pipelined between query stages and over the network reducing latency overhead that one would have if disk I/O were involved. 
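An aside before the analytics exercises begin: the Polygons discussion above lists point-in-polygon tests among the possible spatial operations but does not demonstrate one. A minimal sketch of our own (using a simplified rectangle rather than the full SVL campus polygon) applies the standard Presto ST_Contains function:

select ST_Contains(
         ST_Polygon('POLYGON ((-121.75 37.19, -121.74 37.19, -121.74 37.20, -121.75 37.20, -121.75 37.19))'),
         ST_Point(-121.748360, 37.195840)) as svl_on_campus;

This should return true, since the IBM SVL point used earlier falls inside the rectangle. Line/polygon intersection tests follow the same pattern with ST_Intersects.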
Executing and analyzing analytic workloads Let us start with some simple examples of running queries and analyze the execution. We can either use the dBeaver interface or the watsonx.data CLI. We will eventually be able to use the watsonx.data console UI as well but for the moment it is under construction. Connect to watsonx.data Make sure you are the root user and change to the development directory. cd /root/ibm-lh-dev/bin Open the Presto CLI. Note : The workshop schema was created as part of the introduction to Minio. If you have not run that lab, the schema will not be available. Please see the Introduction to Minio section. ./presto-cli --catalog iceberg_data --schema workshop Run a simple scan query which selects customer names and market segment. select name, mktsegment from customer limit 3; name | mktsegment --------------------+------------ Customer#000000376 | AUTOMOBILE Customer#000000377 | MACHINERY Customer#000000378 | BUILDING (3 rows) To understand the query execution plan we use the explain statement. explain select name, mktsegment from customer; - Output[name, mktsegment] => [name:varchar, mktsegment:varchar] Estimates: {rows: 1500 (15.85kB), cpu: 16230.00, memory: 0.00, network: 16230.00} - RemoteStreamingExchange[GATHER] => [name:varchar, mktsegment:varchar] Estimates: {rows: 1500 (15.85kB), cpu: 16230.00, memory: 0.00, network: 16230.00} - TableScan[TableHandle {connectorId='iceberg_data', connectorHandle='workshop.customer$data@Optional[7053670466726060568]', layout='Optional[workshop.customer$data@Optional[7053670466726060568]]'}] => [name:varchar, mktsegment:varchar] Estimates: {rows: 1500 (15.85kB), cpu: 16230.00, memory: 0.00, network: 0.00} mktsegment := 7:mktsegment:varchar (1:38) name := 2:name:varchar (1:38) What you see above is the hierarchy of logical operations to execute the query. Explain the query and focus on IO operations. explain (type io) select name, mktsegment from customer; { \"inputTableColumnInfos\" : [ { \"table\" : { \"catalog\" : \"iceberg_data\", \"schemaTable\" : { \"schema\" : \"workshop\", \"table\" : \"customer\" } }, \"columnConstraints\" : [ ] } ] } Explain physical execution plan for the query. explain (type distributed) select name, mktsegment from customer; Fragment 0 [SINGLE] Output layout: [name, mktsegment] Output partitioning: SINGLE [] Stage Execution Strategy: UNGROUPED_EXECUTION - Output[name, mktsegment] => [name:varchar, mktsegment:varchar] Estimates: {rows: 1500 (15.85kB), cpu: 16230.00, memory: 0.00, network: 16230.00} - RemoteSource[1] => [name:varchar, mktsegment:varchar] Fragment 1 [SOURCE] Output layout: [name, mktsegment] Output partitioning: SINGLE [] Stage Execution Strategy: UNGROUPED_EXECUTION - TableScan[TableHandle {connectorId='iceberg_data', connectorHandle='workshop.customer$data@Optional[7053670466726060568]', layout='Optional[workshop.customer$data@Optional[7053670466726060568]]'}, grouped = false] => [name:varchar, mktsegment:varchar] Estimates: {rows: 1500 (15.85kB), cpu: 16230.00, memory: 0.00, network: 0.00} mktsegment := 7:mktsegment:varchar (1:57) name := 2:name:varchar (1:57) A fragment represents a stage of the distributed plan. The Presto scheduler schedules the execution by each stage, and stages can be run on separate instances. Create explain statement in a visual format. explain (format graphviz) select name, mktsegment from customer; digraph logical_plan { subgraph cluster_0 { label = \"SINGLE\" plannode_1[label=\"{Output[name, mktsegment]|Estimates: \\{rows: ? 
(?), cpu: ?, memory: ?, network: ?\\} }\", style=\"rounded, filled\", shape=record, fillcolor=white]; plannode_2[label=\"{ExchangeNode[GATHER]|name, mktsegment|Estimates: \\{rows: ? (?), cpu: ?, memory: ?, network: ?\\} }\", style=\"rounded, filled\", shape=record, fillcolor=gold]; plannode_3[label=\"{TableScan | [TableHandle \\{connectorId='iceberg_data', connectorHandle='workshop.customer$data@Optional[7053670466726060568]', layout='Optional[workshop.customer$data@Optional[7053670466726060568]]'\\}]|Estimates: \\{rows: ? (?), cpu: ?, memory: ?, network: ?\\} }\", style=\"rounded, filled\", shape=record, fillcolor=deepskyblue]; } plannode_1 -> plannode_2; plannode_2 -> plannode_3; } We are going to format the output from the explain statement and display it as a graphic. Quit Presto. quit; Place the explain SQL into a file that will be run as a script by Presto. cat <<EOF > /root/ibm-lh-dev/localstorage/volumes/infra/explain.sql explain (format graphviz) select name, mktsegment from customer; EOF Run Presto by pointing to the file with the SQL in it. ./presto-cli --catalog iceberg_data --schema workshop --file /mnt/infra/explain.sql > /tmp/plan.dot We need to remove the headers and extra text that Presto generates around the output (there is no way to suppress it). cat /tmp/plan.dot | sed 's/\"\"/\"/g' | sed -z 's/\"//' | sed '$s/\"//' > /tmp/fixedplan.dot Generate the PNG file from the explain statement. dot -Tpng /tmp/fixedplan.dot > /tmp/plan.png Open a separate terminal window and issue the following command (using the SSH port number and server name supplied in your reservation). Mac OSX user scp -P port watsonx@region.techzone-server.com:/tmp/plan.png plan.png && open plan.png Windows user scp -P port watsonx@region.techzone-server.com:/tmp/plan.png plan.png & start \"\" \"plan.png\" Linux user (watsonx.data server) eog /tmp/plan.png Creating a Table with User-defined Partitions Connect to Presto with the Workshop Schema. ./presto-cli --catalog iceberg_data --schema workshop Create a partitioned table based on the column mktsegment, and copy data from the TPCH.TINY.CUSTOMER table. create table iceberg_data.workshop.part_customer with (partitioning = array['mktsegment']) as select * from tpch.tiny.customer; Quit Presto. quit; Inspect object store directory/object/file structure Open your browser and connect to the MinIO console. If you forget the userid and password, use the following command to extract them or use the passwords command. export LH_S3_ACCESS_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_ACCESS_KEY | sed 's/.*=//') export LH_S3_SECRET_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_SECRET_KEY | sed 's/.*=//') echo \"MinIO Userid : \" $LH_S3_ACCESS_KEY echo \"MinIO Password: \" $LH_S3_SECRET_KEY Click on the Object browser tab to show the current buckets in the MinIO system. Select iceberg-bucket. You will see two tables, customer and part_customer. Select part_customer. Then select data. Examining part_customer, you will notice that the data is split into multiple parquet files stored across multiple directories - a single directory for each unique value of the partition key. Predicate query to utilize partitions Connect to Presto with the Workshop Schema. ./presto-cli --catalog iceberg_data --schema workshop Now that we have created a partitioned table, we will execute a SQL statement that will make use of this fact.
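-- Optional sanity check (a sketch of our own, not part of the original lab): list the
-- partition values and row counts first, so you know what the predicate in the next
-- query will prune the scan down to.
select mktsegment, count(*) as segment_rows
from iceberg_data.workshop.part_customer
group by mktsegment
order by mktsegment;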
select * from iceberg_data.\"workshop\".part_customer where mktsegment='MACHINERY'; custkey | name | address | nationkey | phone | acctbal | mktsegment | comment ---------+--------------------+------------------------------------------+-----------+-----------------+---------+------------+---------------------------------------------------------------------------------------------------------------------- 1131 | Customer#000001131 | KVAvB1lwuN qHWDDPNckenmRGULDFduxYRSBXv | 20 | 30-644-540-9044 | 6019.1 | MACHINERY | er the carefully dogged courts m 1133 | Customer#000001133 | FfA0o cMP02Ylzxtmbq8DCOq | 14 | 24-858-762-2348 | 5335.36 | MACHINERY | g to the pending, ironic pinto beans. furiously blithe packages are fina 1141 | Customer#000001141 | A6uzuXpgRPp19ek8K8zd5O | 22 | 32-330-618-9020 | 0.97 | MACHINERY | accounts. furiously pending deposits cajole. c 1149 | Customer#000001149 | 5JOAwCy8MD70TUZJDyxgEBMe | 3 | 13-254-242-3889 | 6287.79 | MACHINERY | ress requests haggle carefully across the fluffily regula 1150 | Customer#000001150 | fUJqzdkQg1 | 21 | 31-236-665-8430 | -117.31 | MACHINERY | usly final dolphins. fluffily bold platelets sleep. slyly unusual attainments lo 1155 | Customer#000001155 | kEDBn1IQWyHyYjgGGs6FiXfm3 | 8 | 18-864-953-3058 | 3510.25 | MACHINERY | ages? fluffily even accounts shall have to boost furiously alongside of the furiously pendin 1158 | Customer#000001158 | btAl2dQdvNV9cEzTwVRloTb08sLYKDopV2cK,p | 10 | 20-487-747-8857 | 3081.79 | MACHINERY | theodolites use stealthy asymptotes. frets integrate even instructions. car 1161 | Customer#000001161 | QD7s2P6QpCC6g9t2aVzKg7y | 19 | 29-213-663-3342 | 591.31 | MACHINERY | ly alongside of the quickly blithe ideas. quickly ironic accounts haggle regul 1165 | Customer#000001165 | h7KTXGSqsn0 | 9 | 19-766-409-6769 | 8177.33 | MACHINERY | jole slyly beside the quickly final accounts. silent, even requests are stealthily ironic, re 1166 | Customer#000001166 | W4FAGNPKcJFebzldtNp8SehhH3 | 17 | 27-869-223-7506 | 507.26 | MACHINERY | before the platelets! carefully bold ideas lose carefully 1169 | Customer#000001169 | 04YQNIYyRRFxUnJsTP36da | 4 | 14-975-169-9356 | 7503.3 | MACHINERY | into beans doubt about the slyly ironic multipliers. carefully regular requests breach theodolites. special packages 1188 | Customer#000001188 | PtwoF3jNQ9r6 GbPIelt GvbNBuDH | 15 | 25-108-989-8154 | 3698.86 | MACHINERY | ts. quickly unusual ideas affix aft 1190 | Customer#000001190 | JwzW9OtxFRXDnVo5hXl8 2A5VxH12 | 15 | 25-538-604-9042 | 2743.63 | MACHINERY | regular deposits according to the pending packages wake blithely among the silent inst 1203 | Customer#000001203 | 9pTq4gggfKoSqQetn0yJR | 16 | 26-370-660-6154 | 5787.69 | MACHINERY | osits nag furiously final accounts. silent pack ... Many more rows Due to the partitioning of this table by mktsegment , it will completely skip scanning a large percentage of the objects in the object store. We run an explain against this query using the following command. 
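-- Note (our own observation, not from the original lab): the explain that follows
-- targets the unpartitioned customer table rather than part_customer. To see how the
-- plan changes when partition pruning can apply, you could also try:
-- explain select * from iceberg_data.workshop.part_customer where mktsegment='MACHINERY';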
explain (format graphviz) select * from iceberg_data.\"workshop\".customer where mktsegment='MACHINERY'; Query Plan ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- digraph logical_plan { subgraph cluster_0 { label = \"SINGLE\" plannode_1[label=\"{Output[custkey, name, address, nationkey, phone, acctbal, mktsegment, comment]|Estimates: \\{rows: 750 (56.84kB), cpu: 232830.00, memory: 0.00, network: 58207.50\\} }\", style=\"rounded, filled\", shape=record, fillcolor=white]; plannode_2[label=\"{ExchangeNode[GATHER]|custkey, name, address, nationkey, phone, acctbal, mktsegment, comment|Estimates: \\{rows: 750 (56.84kB), cpu: 232830.00, memory: 0.00, network: 58207.50\\} }\", style=\"rounded, filled\", shape=record, fillcolor=gold]; plannode_3[label=\"{Filter|(mktsegment) = (VARCHAR'MACHINERY')|Estimates: \\{rows: 750 (56.84kB), cpu: 232830.00, memory: 0.00, network: 0.00\\} }\", style=\"rounded, filled\", shape=record, fillcolor=yellow]; plannode_4[label=\"{TableScan | [TableHandle \\{connectorId='iceberg_data', connectorHandle='workshop.customer$data@Optional[7230522396120575591]', layout='Optional[workshop.customer$data@Optional[7230522396120575591]]'\\}]|Estimates: \\{rows: 1500 (113.69kB), cpu: 116415.00, memory: 0.00, network: 0.00\\} }\", style=\"rounded, filled\", shape=record, fillcolor=deepskyblue]; } plannode_1 -> plannode_2; plannode_2 -> plannode_3; plannode_3 -> plannode_4; } To visualize this, we are going to run this command and place the results into a temporary file. Exit Presto. quit; Place the explain SQL into the following file. cat </root/ibm-lh-dev/localstorage/volumes/infra/explain.sql explain (format graphviz) select * from iceberg_data.\"workshop\".customer where mktsegment='MACHINERY'; EOF Run the Presto command to generate the explain output. ./presto-cli --catalog iceberg_data --schema workshop --file /mnt/infra/explain.sql > /tmp/plan.dot Remove Headers. cat /tmp/plan.dot | sed 's/\"\"/\"/g' | sed -z 's/\"//' | sed '$s/\"//' > /tmp/fixedplan.dot Generate the PNG file from the explain statement. dot -Tpng /tmp/fixedplan.dot > /tmp/plan.png Open a separate terminal window and issue the following command (using the SSH port number and server name supplied in your reservation). Mac OSX user scp -port watsonx@region.techzone-server.com:/tmp/plan.png plan.png && open plan.png Windows user scp -port watsonx@region.techzone-server.com:/tmp/plan.png plan.png & start \"\" \"plan.png\" Linux user (watsonx.data server) eog /tmp/plan.png Joins and Aggregations This section will create an orders table to test joins and aggregations. Start Presto CLI with Workshop Schema. ./presto-cli --catalog iceberg_data --schema workshop Create the Orders Table. create table iceberg_data.workshop.orders as select * from tpch.tiny.orders; CREATE TABLE: 15000 rows Use a Windowing function. SELECT orderkey, clerk, totalprice, rank() OVER (PARTITION BY clerk ORDER BY totalprice DESC) AS rnk FROM orders ORDER BY clerk, rnk; Try to write a window function to show the custkey, orderdate, totalprice and priororder. The output should look like this. 
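If you get stuck, one possible solution is sketched below (an assumption on our part; other formulations work too). It uses the lag window function, which returns the value of an expression from the preceding row in the window. Note that in the expected output the prior order carries over from one custkey to the next, so the window is ordered over the whole table rather than partitioned by custkey:

select custkey, orderdate, totalprice,
       lag(totalprice) over (order by custkey, orderdate) as priororder
from orders
order by custkey, orderdate;

Running it should produce output like the following.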
custkey | orderdate | totalprice | priororder ---------+------------+------------+------------ 1 | 1993-06-05 | 152411.41 | NULL 1 | 1993-08-13 | 83095.85 | 152411.41 1 | 1994-05-08 | 51134.82 | 83095.85 1 | 1995-10-29 | 165928.33 | 51134.82 1 | 1997-01-29 | 231040.44 | 165928.33 1 | 1997-03-04 | 270087.44 | 231040.44 1 | 1997-06-23 | 357345.46 | 270087.44 1 | 1997-11-18 | 28599.83 | 357345.46 1 | 1998-03-29 | 89230.03 | 28599.83 2 | 1993-02-19 | 170842.93 | 89230.03 2 | 1993-05-03 | 154867.09 | 170842.93 2 | 1993-09-30 | 143707.7 | 154867.09 2 | 1994-08-15 | 116247.57 | 143707.7 2 | 1994-12-29 | 45657.87 | 116247.57 2 | 1996-03-04 | 181875.6 | 45657.87 Prepared statements Save a query as a prepared statement. prepare customer_by_segment from select * from customer where mktsegment=?; Execute prepared statement using parameters. execute customer_by_segment using 'FURNITURE'; Note : This is only valid for the active session. Quit Presto. quit;","title":"Analytic Workloads"},{"location":"wxd-analytics/#analytic-workloads","text":"Watsonx.data is based on open source PrestoDB, a distributed query engine that enables querying data stored in open file formats using open table formats for optimization and performance. Some of the characteristics which you will learn and see in action include: Compute processing is performed in memory and in parallel. Data is pipelined between query stages and over the network reducing latency overhead that one would have if disk I/O were involved.","title":"Analytic Workloads"},{"location":"wxd-analytics/#executing-and-analyzing-analytic-workloads","text":"Let us start with some simple examples of running queries and analyze the execution. We can either use the dBeaver interface or the watsonx.data CLI. We will eventually be able to use the watsonx.data console UI as well but for the moment it is under construction.","title":"Executing and analyzing analytic workloads"},{"location":"wxd-analytics/#connect-to-watsonxdata","text":"Make sure you are the root user and change to the development directory. cd /root/ibm-lh-dev/bin Open the Presto CLI. Note : The workshop schema was created as part of the introduction to Minio. If you have not run that lab, the schema will not be available. Please see the Introduction to Minio section. ./presto-cli --catalog iceberg_data --schema workshop Run a simple scan query which selects customer names and market segment. select name, mktsegment from customer limit 3; name | mktsegment --------------------+------------ Customer#000000376 | AUTOMOBILE Customer#000000377 | MACHINERY Customer#000000378 | BUILDING (3 rows) To understand the query execution plan we use the explain statement. explain select name, mktsegment from customer; - Output[name, mktsegment] => [name:varchar, mktsegment:varchar] Estimates: {rows: 1500 (15.85kB), cpu: 16230.00, memory: 0.00, network: 16230.00} - RemoteStreamingExchange[GATHER] => [name:varchar, mktsegment:varchar] Estimates: {rows: 1500 (15.85kB), cpu: 16230.00, memory: 0.00, network: 16230.00} - TableScan[TableHandle {connectorId='iceberg_data', connectorHandle='workshop.customer$data@Optional[7053670466726060568]', layout='Optional[workshop.customer$data@Optional[7053670466726060568]]'}] => [name:varchar, mktsegment:varchar] Estimates: {rows: 1500 (15.85kB), cpu: 16230.00, memory: 0.00, network: 0.00} mktsegment := 7:mktsegment:varchar (1:38) name := 2:name:varchar (1:38) What you see above is the hierarchy of logical operations to execute the query. Explain the query and focus on IO operations. 
explain (type io) select name, mktsegment from customer; { \"inputTableColumnInfos\" : [ { \"table\" : { \"catalog\" : \"iceberg_data\", \"schemaTable\" : { \"schema\" : \"workshop\", \"table\" : \"customer\" } }, \"columnConstraints\" : [ ] } ] } Explain physical execution plan for the query. explain (type distributed) select name, mktsegment from customer; Fragment 0 [SINGLE] Output layout: [name, mktsegment] Output partitioning: SINGLE [] Stage Execution Strategy: UNGROUPED_EXECUTION - Output[name, mktsegment] => [name:varchar, mktsegment:varchar] Estimates: {rows: 1500 (15.85kB), cpu: 16230.00, memory: 0.00, network: 16230.00} - RemoteSource[1] => [name:varchar, mktsegment:varchar] Fragment 1 [SOURCE] Output layout: [name, mktsegment] Output partitioning: SINGLE [] Stage Execution Strategy: UNGROUPED_EXECUTION - TableScan[TableHandle {connectorId='iceberg_data', connectorHandle='workshop.customer$data@Optional[7053670466726060568]', layout='Optional[workshop.customer$data@Optional[7053670466726060568]]'}, grouped = false] => [name:varchar, mktsegment:varchar] Estimates: {rows: 1500 (15.85kB), cpu: 16230.00, memory: 0.00, network: 0.00} mktsegment := 7:mktsegment:varchar (1:57) name := 2:name:varchar (1:57) A fragment represents a stage of the distributed plan. The Presto scheduler schedules the execution by each stage, and stages can be run on separate instances. Create explain statement in a visual format. explain (format graphviz) select name, mktsegment from customer; digraph logical_plan { subgraph cluster_0 { label = \"SINGLE\" plannode_1[label=\"{Output[name, mktsegment]|Estimates: \\{rows: ? (?), cpu: ?, memory: ?, network: ?\\} }\", style=\"rounded, filled\", shape=record, fillcolor=white]; plannode_2[label=\"{ExchangeNode[GATHER]|name, mktsegment|Estimates: \\{rows: ? (?), cpu: ?, memory: ?, network: ?\\} }\", style=\"rounded, filled\", shape=record, fillcolor=gold]; plannode_3[label=\"{TableScan | [TableHandle \\{connectorId='iceberg_data', connectorHandle='workshop.customer$data@Optional[7053670466726060568]', layout='Optional[workshop.customer$data@Optional[7053670466726060568]]'\\}]|Estimates: \\{rows: ? (?), cpu: ?, memory: ?, network: ?\\} }\", style=\"rounded, filled\", shape=record, fillcolor=deepskyblue]; } plannode_1 -> plannode_2; plannode_2 -> plannode_3; } We are going to format the output from the explain statement and display it as a graphic. Quit Presto. quit; Place the explain SQL into a file that will be run as a script by Presto. cat </root/ibm-lh-dev/localstorage/volumes/infra/explain.sql explain (format graphviz) select name, mktsegment from customer; EOF Run Presto by pointing to the file with the SQL in it. ./presto-cli --catalog iceberg_data --schema workshop --file /mnt/infra/explain.sql > /tmp/plan.dot We need to get rid of headers and stuff that Presto generated when creating the output (there is no way to turn that off). cat /tmp/plan.dot | sed 's/\"\"/\"/g' | sed -z 's/\"//' | sed '$s/\"//' > /tmp/fixedplan.dot Generate the PNG file from the explain statement. 
dot -Tpng /tmp/fixedplan.dot > /tmp/plan.png Open a separate terminal window and issue the following command (using the SSH port number and server name supplied in your reservation).","title":"Connect to watsonx.data"},{"location":"wxd-analytics/#mac-osx-user","text":"scp -port watsonx@region.techzone-server.com:/tmp/plan.png plan.png && open plan.png","title":"Mac OSX user"},{"location":"wxd-analytics/#windows-user","text":"scp -port watsonx@region.techzone-server.com:/tmp/plan.png plan.png & start \"\" \"plan.png\"","title":"Windows user"},{"location":"wxd-analytics/#linux-user-watsonxdata-server","text":"eog /tmp/plan.png","title":"Linux user (watsonx.data server)"},{"location":"wxd-analytics/#creating-a-table-with-user-defined-partitions","text":"Connect to Presto with the Workshop Schema. ./presto-cli --catalog iceberg_data --schema workshop Create a partitioned table, based on column mktsegment and copy data from TPCH.TINY.CUSTOMER table. create table iceberg_data.workshop.part_customer with (partitioning = array['mktsegment']) as select * from tpch.tiny.customer; Quit Presto. quit;","title":"Creating a Table with User-defined Partitions"},{"location":"wxd-analytics/#inspect-object-store-directoryobjectfile-structure","text":"Open your browser and connect to the MinIO console. If you forget the userid and password, use the following command to extract them or use the passwords command. export LH_S3_ACCESS_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_ACCESS_KEY | sed 's/.*=//') export LH_S3_SECRET_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_SECRET_KEY | sed 's/.*=//') echo \"MinIO Userid : \" $LH_S3_ACCESS_KEY echo \"MinIO Password: \" $LH_S3_SECRET_KEY Click on the Object browser tab to show the current buckets in the MinIO system. Select iceberg-bucket. You will see two tables, customer and part_customer. Select part_customer. Then select data. Examining the part_customer, you will notice is the data is split into multiple parquet files stored across multiple directories - a single directory for each unique value of the partition key.","title":"Inspect object store directory/object/file structure"},{"location":"wxd-analytics/#predicate-query-to-utilize-partitions","text":"Connect to Presto with the Workshop Schema. ./presto-cli --catalog iceberg_data --schema workshop Now that have created a partitioned table, we will execute a SQL statement that will make use of this fact. select * from iceberg_data.\"workshop\".part_customer where mktsegment='MACHINERY'; custkey | name | address | nationkey | phone | acctbal | mktsegment | comment ---------+--------------------+------------------------------------------+-----------+-----------------+---------+------------+---------------------------------------------------------------------------------------------------------------------- 1131 | Customer#000001131 | KVAvB1lwuN qHWDDPNckenmRGULDFduxYRSBXv | 20 | 30-644-540-9044 | 6019.1 | MACHINERY | er the carefully dogged courts m 1133 | Customer#000001133 | FfA0o cMP02Ylzxtmbq8DCOq | 14 | 24-858-762-2348 | 5335.36 | MACHINERY | g to the pending, ironic pinto beans. furiously blithe packages are fina 1141 | Customer#000001141 | A6uzuXpgRPp19ek8K8zd5O | 22 | 32-330-618-9020 | 0.97 | MACHINERY | accounts. furiously pending deposits cajole. 
c 1149 | Customer#000001149 | 5JOAwCy8MD70TUZJDyxgEBMe | 3 | 13-254-242-3889 | 6287.79 | MACHINERY | ress requests haggle carefully across the fluffily regula 1150 | Customer#000001150 | fUJqzdkQg1 | 21 | 31-236-665-8430 | -117.31 | MACHINERY | usly final dolphins. fluffily bold platelets sleep. slyly unusual attainments lo 1155 | Customer#000001155 | kEDBn1IQWyHyYjgGGs6FiXfm3 | 8 | 18-864-953-3058 | 3510.25 | MACHINERY | ages? fluffily even accounts shall have to boost furiously alongside of the furiously pendin 1158 | Customer#000001158 | btAl2dQdvNV9cEzTwVRloTb08sLYKDopV2cK,p | 10 | 20-487-747-8857 | 3081.79 | MACHINERY | theodolites use stealthy asymptotes. frets integrate even instructions. car 1161 | Customer#000001161 | QD7s2P6QpCC6g9t2aVzKg7y | 19 | 29-213-663-3342 | 591.31 | MACHINERY | ly alongside of the quickly blithe ideas. quickly ironic accounts haggle regul 1165 | Customer#000001165 | h7KTXGSqsn0 | 9 | 19-766-409-6769 | 8177.33 | MACHINERY | jole slyly beside the quickly final accounts. silent, even requests are stealthily ironic, re 1166 | Customer#000001166 | W4FAGNPKcJFebzldtNp8SehhH3 | 17 | 27-869-223-7506 | 507.26 | MACHINERY | before the platelets! carefully bold ideas lose carefully 1169 | Customer#000001169 | 04YQNIYyRRFxUnJsTP36da | 4 | 14-975-169-9356 | 7503.3 | MACHINERY | into beans doubt about the slyly ironic multipliers. carefully regular requests breach theodolites. special packages 1188 | Customer#000001188 | PtwoF3jNQ9r6 GbPIelt GvbNBuDH | 15 | 25-108-989-8154 | 3698.86 | MACHINERY | ts. quickly unusual ideas affix aft 1190 | Customer#000001190 | JwzW9OtxFRXDnVo5hXl8 2A5VxH12 | 15 | 25-538-604-9042 | 2743.63 | MACHINERY | regular deposits according to the pending packages wake blithely among the silent inst 1203 | Customer#000001203 | 9pTq4gggfKoSqQetn0yJR | 16 | 26-370-660-6154 | 5787.69 | MACHINERY | osits nag furiously final accounts. silent pack ... Many more rows Due to the partitioning of this table by mktsegment , it will completely skip scanning a large percentage of the objects in the object store. We run an explain against this query using the following command. 
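-- As noted earlier (our own observation): the explain below targets the unpartitioned
-- customer table; substituting part_customer shows the plan with partition pruning.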
explain (format graphviz) select * from iceberg_data.\"workshop\".customer where mktsegment='MACHINERY'; Query Plan ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- digraph logical_plan { subgraph cluster_0 { label = \"SINGLE\" plannode_1[label=\"{Output[custkey, name, address, nationkey, phone, acctbal, mktsegment, comment]|Estimates: \\{rows: 750 (56.84kB), cpu: 232830.00, memory: 0.00, network: 58207.50\\} }\", style=\"rounded, filled\", shape=record, fillcolor=white]; plannode_2[label=\"{ExchangeNode[GATHER]|custkey, name, address, nationkey, phone, acctbal, mktsegment, comment|Estimates: \\{rows: 750 (56.84kB), cpu: 232830.00, memory: 0.00, network: 58207.50\\} }\", style=\"rounded, filled\", shape=record, fillcolor=gold]; plannode_3[label=\"{Filter|(mktsegment) = (VARCHAR'MACHINERY')|Estimates: \\{rows: 750 (56.84kB), cpu: 232830.00, memory: 0.00, network: 0.00\\} }\", style=\"rounded, filled\", shape=record, fillcolor=yellow]; plannode_4[label=\"{TableScan | [TableHandle \\{connectorId='iceberg_data', connectorHandle='workshop.customer$data@Optional[7230522396120575591]', layout='Optional[workshop.customer$data@Optional[7230522396120575591]]'\\}]|Estimates: \\{rows: 1500 (113.69kB), cpu: 116415.00, memory: 0.00, network: 0.00\\} }\", style=\"rounded, filled\", shape=record, fillcolor=deepskyblue]; } plannode_1 -> plannode_2; plannode_2 -> plannode_3; plannode_3 -> plannode_4; } To visualize this, we are going to run this command and place the results into a temporary file. Exit Presto. quit; Place the explain SQL into the following file. cat <<EOF > /root/ibm-lh-dev/localstorage/volumes/infra/explain.sql explain (format graphviz) select * from iceberg_data.\"workshop\".customer where mktsegment='MACHINERY'; EOF Run the Presto command to generate the explain output. ./presto-cli --catalog iceberg_data --schema workshop --file /mnt/infra/explain.sql > /tmp/plan.dot Remove Headers. cat /tmp/plan.dot | sed 's/\"\"/\"/g' | sed -z 's/\"//' | sed '$s/\"//' > /tmp/fixedplan.dot Generate the PNG file from the explain statement. dot -Tpng /tmp/fixedplan.dot > /tmp/plan.png Open a separate terminal window and issue the following command (using the SSH port number and server name supplied in your reservation).","title":"Predicate query to utilize partitions"},{"location":"wxd-analytics/#mac-osx-user_1","text":"scp -P port watsonx@region.techzone-server.com:/tmp/plan.png plan.png && open plan.png","title":"Mac OSX user"},{"location":"wxd-analytics/#windows-user_1","text":"scp -P port watsonx@region.techzone-server.com:/tmp/plan.png plan.png & start \"\" \"plan.png\"","title":"Windows user"},{"location":"wxd-analytics/#linux-user-watsonxdata-server_1","text":"eog /tmp/plan.png","title":"Linux user (watsonx.data server)"},{"location":"wxd-analytics/#joins-and-aggregations","text":"This section will create an orders table to test joins and aggregations. Start Presto CLI with Workshop Schema. ./presto-cli --catalog iceberg_data --schema workshop Create the Orders Table. create table iceberg_data.workshop.orders as select * from tpch.tiny.orders; CREATE TABLE: 15000 rows Use a Windowing function.
SELECT orderkey, clerk, totalprice, rank() OVER (PARTITION BY clerk ORDER BY totalprice DESC) AS rnk FROM orders ORDER BY clerk, rnk; Try to write a window function to show the custkey, orderdate, totalprice and priororder. The output should look like this. custkey | orderdate | totalprice | priororder ---------+------------+------------+------------ 1 | 1993-06-05 | 152411.41 | NULL 1 | 1993-08-13 | 83095.85 | 152411.41 1 | 1994-05-08 | 51134.82 | 83095.85 1 | 1995-10-29 | 165928.33 | 51134.82 1 | 1997-01-29 | 231040.44 | 165928.33 1 | 1997-03-04 | 270087.44 | 231040.44 1 | 1997-06-23 | 357345.46 | 270087.44 1 | 1997-11-18 | 28599.83 | 357345.46 1 | 1998-03-29 | 89230.03 | 28599.83 2 | 1993-02-19 | 170842.93 | 89230.03 2 | 1993-05-03 | 154867.09 | 170842.93 2 | 1993-09-30 | 143707.7 | 154867.09 2 | 1994-08-15 | 116247.57 | 143707.7 2 | 1994-12-29 | 45657.87 | 116247.57 2 | 1996-03-04 | 181875.6 | 45657.87","title":"Joins and Aggregations"},{"location":"wxd-analytics/#prepared-statements","text":"Save a query as a prepared statement. prepare customer_by_segment from select * from customer where mktsegment=?; Execute the prepared statement using parameters. execute customer_by_segment using 'FURNITURE'; Note : This is only valid for the active session. Quit Presto. quit;","title":"Prepared statements"},{"location":"wxd-certificate/","text":"Watsonx.data Certificates Watsonx.data Certificate Failure Due to a change in TechZone URLs, the self-signed certificates in the watsonx.data Developer image may be invalid. If you are attempting to connect to the watsonx.data system from outside the virtual machine, you will need to run the following commands to fix the self-signed certificate. Step 1: Connect to the Server Use the SSH port to connect to the server and make sure that you become the root user. sudo su - Step 2: Update the Certificate We need to update the certificate by using a utility in the developer toolbox. Start the toolbox code by switching to the bin directory and issuing the following command. cd /root/ibm-lh-dev/bin ./dev-sandbox Once inside the development container, you will need to update the program that generates the certificates. Note : The certificate should cover all TechZone locations. If for some reason your TechZone server does not match the pattern *.services.cloud.techzone.ibm.com , update it in the command below. sed -i '/DNS.14.*/a DNS.15 = watsonxdata' /scripts/gen_certs.sh sed -i '/DNS.15.*/a DNS.16 = watsonxdata.gym.lan' /scripts/gen_certs.sh sed -i '/DNS.16.*/a DNS.17 = *.services.cloud.techzone.ibm.com' /scripts/gen_certs.sh ./scripts/gen_certs.sh Once the script completes, exit the toolkit. exit Step 3: Stop and Restart the System The certificates need to be replaced in all the running containers. You must stop and restart them. You must include the diagnostic flag or else the system will not work properly. The startup will take some time to complete. The Postgres pod will display some warning messages which can be safely ignored. ./stop.sh export LH_RUN_MODE=diag ./start.sh Step 4: Generate Custom Certificate The first step is to copy the new certificates to the central /certs directory used by this image. docker cp ibm-lh-presto:/mnt/infra/tls/lh-ssl-ts.jks /certs/lh-ssl-ts.jks docker cp ibm-lh-presto:/mnt/infra/tls/cert.crt /certs/lh-ssl-ts.crt Next we need to generate the certificate file that is used by a number of the examples in the lab instructions.
rm -f presto.crt echo QUIT | openssl s_client -showcerts -connect 127.0.0.1:8443 | awk '/-----BEGIN CERTIFICATE-----/ {p=1}; p; /-----END CERTIFICATE-----/ {p=0}' > presto.crt You can print the certificate if you need it for connections from CP4D. cat presto.crt Step 5: Generate Java Keystore File The next step will create the Java Keystore file. When prompted, use a password of watsonx.data and say yes to accepting the certificate. Make sure that you see your host in the list. For instance, useast.services.cloud.techzone.ibm.com should be displayed when you see the results. rm -f presto-key.jks keytool -import -alias presto-crt -file ./presto.crt -keystore ./presto-key.jks The following is an example of the output from the keytool command. Owner: CN=Dummy-Self-signed-Cert, EMAILADDRESS=dummy@example.dum, OU=For-CPD, O=Data and AI, L=Home-Town, ST=XX, C=YY Issuer: CN=Dummy-Self-signed-Cert, EMAILADDRESS=dummy@example.dum, OU=For-CPD, O=Data and AI, L=Home-Town, ST=XX, C=YY Serial number: 73f26644ad83ac8cdf9afbda6006d4e52f244fac Valid from: Tue Mar 05 17:42:56 EST 2024 until: Wed May 23 18:42:56 EDT 2035 Certificate fingerprints: SHA1: 3A:6C:52:80:3D:14:CF:D0:E7:AC:14:13:6F:46:FB:B1:8C:BA:E4:37 SHA256: 28:E7:AD:4E:BA:5F:00:4C:B7:2E:61:3E:3B:96:E5:DF:01:D5:80:CE:1A:B3:EF:B7:86:11:26:4A:B6:7C:90:8A Signature algorithm name: SHA512withRSA Subject Public Key Algorithm: 2048-bit RSA key Version: 3 Extensions: #1: ObjectId: 2.5.29.37 Criticality=false ExtendedKeyUsages [ serverAuth ] #2: ObjectId: 2.5.29.17 Criticality=false SubjectAlternativeName [ DNSName: ibm-lh-presto-svc DNSName: *.svc.cluster.local DNSName: api-svc DNSName: *.api DNSName: localhost DNSName: ibm-lh-hive-metastore DNSName: ibm-lh-hive-metastore-svc DNSName: lhconsole-api-svc DNSName: lhconsole-nodeclient-svc DNSName: ibm-lh-ranger-svc DNSName: ibm-lh-javaapi-svc DNSName: ibm-lh-prestissimo-svc DNSName: ibm-lh-qhmm DNSName: ibm-lh-qhmm-svc DNSName: watsonxdata DNSName: watsonxdata.gym.lan DNSName: *.services.cloud.techzone.ibm.com ] Trust this certificate? [no]: yes Certificate was added to keystore Step 6: Create Certificate and Keystore Copies The final step is to copy the certs and keystore values to a central location so they can be used in various scripts and notebooks. \\cp -f presto-key.jks /certs \\cp -f presto.crt /certs chmod +r /certs/*.* \\cp -rf /certs /notebooks/","title":"Watsonx.data Certificates"},{"location":"wxd-certificate/#watsonxdata-certificates","text":"Watsonx.data Certificate Failure Due to a change in TechZone URLs, the self-signed certificates in the watsonx.data Developer image may be invalid. If you are attempting to connect to the watsonx.data system from outside the virtual machine, you will need to run the following commands to fix the self-signed certificate.","title":"Watsonx.data Certificates"},{"location":"wxd-certificate/#step-1-connect-to-the-server","text":"Use the SSH port to connect to the server and make sure that you become the root user. sudo su -","title":"Step 1: Connect to the Server"},{"location":"wxd-certificate/#step-2-update-the-certificate","text":"We need to update the certificate by using a utility in the developer toolbox. Start the toolbox code by switching to the bin directory and issuing the following command. cd /root/ibm-lh-dev/bin ./dev-sandbox Once inside the development container, you will need to update the program that generates the certificates. Note : The certificate should cover all TechZone locations.
If for some reason your TechZone server does not match the pattern *.services.cloud.techzone.ibm.com , update it in the command below. sed -i '/DNS.14.*/a DNS.15 = watsonxdata' /scripts/gen_certs.sh sed -i '/DNS.15.*/a DNS.16 = watsonxdata.gym.lan' /scripts/gen_certs.sh sed -i '/DNS.16.*/a DNS.17 = *.services.cloud.techzone.ibm.com' /scripts/gen_certs.sh ./scripts/gen_certs.sh Once the script completes, exit the toolkit. exit","title":"Step 2: Update the Certificate"},{"location":"wxd-certificate/#step-3-stop-and-restart-the-system","text":"The certificates need to be replaced in all the running containers. You must stop and restart them. You must include the diagnostic flag or else the system will not work properly. The startup will take some time to complete. The Postgres pod will display some warning messages which can be safely ignored. ./stop.sh export LH_RUN_MODE=diag ./start.sh","title":"Step 3: Stop and Restart the System"},{"location":"wxd-certificate/#step-4-generate-custom-certificate","text":"The first step is to copy the new certificates to the central /certs directory used by this image. docker cp ibm-lh-presto:/mnt/infra/tls/lh-ssl-ts.jks /certs/lh-ssl-ts.jks docker cp ibm-lh-presto:/mnt/infra/tls/cert.crt /certs/lh-ssl-ts.crt Next we need to generate the certificate file that is used by a number of the examples in the lab instructions. rm -f presto.crt echo QUIT | openssl s_client -showcerts -connect 127.0.0.1:8443 | awk '/-----BEGIN CERTIFICATE-----/ {p=1}; p; /-----END CERTIFICATE-----/ {p=0}' > presto.crt You can print the certificate if you need it for connections from CP4D. cat presto.crt","title":"Step 4: Generate Custom Certificate"},{"location":"wxd-certificate/#step-5-generate-java-keystore-file","text":"The next step will create the Java Keystore file. When prompted, use a password of watsonx.data and say yes to accepting the certificate. Make sure that you see your host in the list. For instance, useast.services.cloud.techzone.ibm.com should be displayed when you see the results. rm -f presto-key.jks keytool -import -alias presto-crt -file ./presto.crt -keystore ./presto-key.jks The following is an example of the output from the keytool command. Owner: CN=Dummy-Self-signed-Cert, EMAILADDRESS=dummy@example.dum, OU=For-CPD, O=Data and AI, L=Home-Town, ST=XX, C=YY Issuer: CN=Dummy-Self-signed-Cert, EMAILADDRESS=dummy@example.dum, OU=For-CPD, O=Data and AI, L=Home-Town, ST=XX, C=YY Serial number: 73f26644ad83ac8cdf9afbda6006d4e52f244fac Valid from: Tue Mar 05 17:42:56 EST 2024 until: Wed May 23 18:42:56 EDT 2035 Certificate fingerprints: SHA1: 3A:6C:52:80:3D:14:CF:D0:E7:AC:14:13:6F:46:FB:B1:8C:BA:E4:37 SHA256: 28:E7:AD:4E:BA:5F:00:4C:B7:2E:61:3E:3B:96:E5:DF:01:D5:80:CE:1A:B3:EF:B7:86:11:26:4A:B6:7C:90:8A Signature algorithm name: SHA512withRSA Subject Public Key Algorithm: 2048-bit RSA key Version: 3 Extensions: #1: ObjectId: 2.5.29.37 Criticality=false ExtendedKeyUsages [ serverAuth ] #2: ObjectId: 2.5.29.17 Criticality=false SubjectAlternativeName [ DNSName: ibm-lh-presto-svc DNSName: *.svc.cluster.local DNSName: api-svc DNSName: *.api DNSName: localhost DNSName: ibm-lh-hive-metastore DNSName: ibm-lh-hive-metastore-svc DNSName: lhconsole-api-svc DNSName: lhconsole-nodeclient-svc DNSName: ibm-lh-ranger-svc DNSName: ibm-lh-javaapi-svc DNSName: ibm-lh-prestissimo-svc DNSName: ibm-lh-qhmm DNSName: ibm-lh-qhmm-svc DNSName: watsonxdata DNSName: watsonxdata.gym.lan DNSName: *.services.cloud.techzone.ibm.com ] Trust this certificate?
[no]: yes Certificate was added to keystore","title":"Step 5: Generate Java Keystore File"},{"location":"wxd-certificate/#step-6-create-certificate-and-keystore-copies","text":"The final step is to copy the certs and keystore values to a central location so they can be used in various scripts and notebooks. \\cp -f presto-key.jks /certs \\cp -f presto.crt /certs chmod +r /certs/*.* \\cp -rf /certs /notebooks/","title":"Step 6: Create Certificate and Keystore Copies"},{"location":"wxd-connections/","text":"Database Connections There are four database systems that can be accessed inside and outside the virtual machine environment: watsonx.data Presto, Db2 LUW, MySQL and PostgreSQL. In order to access these services from outside the virtual machine image, you need the server name and port for the service. You will also need to download the presto-key.jks file for connecting to Presto. Connection Certificate Accessing watsonx.data (Presto) Accessing Db2 Accessing PostgreSQL Accessing MySQL Adding a database to watsonx.data Accessing watsonx.data via Python Accessing watsonx.data via Pandas Dataframes Generating a Certificate Adding a Service Watsonx.data Connection Certificate When connecting to the watsonx.data Presto database, you will need to have the connection certificate available to the client that you are using. Usually this location is your workstation, but it could be another service like CP4D. To extract the certificate to your local file system, use the following command in a terminal window. Replace the port and region.techzone-server.com with the SSH values found in the TechZone reservation. scp -P port watsonx@region.techzone-server.com:/certs/presto-key.jks /Users/myname/Downloads Change the target directory to a location that you can remember! You can also download the certificate by using the Jupyter Notebook link and opening the Credentials notebook. There you will find links to the certificates. Watsonx.data Presto Access When connecting to the Presto engine, choose the PrestoDB driver. Presto Internal Access For local access the following credentials are used: Hostname: localhost Port: 8443 Username: ibmlhadmin Password: password Database: tpch In addition, you need to set the following driver properties: SSL True SSLTrustStorePath /certs/presto-key.jks SSLTrustStorePassword watsonx.data Presto External Access The watsonx.data Presto database requires that the certificate be extracted from the image. See the section above on Connection Certificate for more details. In the following settings, remember to update the Hostname and Port to the values provided in your TechZone reservation. The database connection settings are: Hostname: region.techzone-server.com Port: port Username: ibmlhadmin Password: password Database: tpch In addition, you need to set the following driver properties: SSL True SSLTrustStorePath /mydownload/presto-key.jks SSLTrustStorePassword watsonx.data Note : The /mydownload/presto-key.jks value needs to be replaced with the location where you copied the key in the earlier step. Db2 Access When connecting to the Db2 engine, select the Db2 LUW driver.
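If your tool asks for a single JDBC URL rather than individual connection fields, the settings in the sections below assemble into the standard Db2 JDBC URL format. This is a sketch only; jdbc:db2:// is the standard Db2 driver prefix, and the external host and port values come from your TechZone reservation. jdbc:db2://watsonxdata:50000/gosales (internal access) jdbc:db2://region.techzone-server.com:port/gosales (external access)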
Db2 Internal Access The Db2 server can be accessed on port 50000 inside the virtual machine using the following credentials: Hostname - watsonxdata Port - 50000 Username - db2inst1 Password - db2inst1 Database - gosales SSL - off Db2 External Access When accessing the database outside the virtual machine, you must change the host to region.techzone-server.com and the port number based on your TechZone reservation. All the other settings remain the same. Hostname - region.techzone-server.com Port - port Username - db2inst1 Password - db2inst1 Database - gosales SSL - off PostgreSQL Access When connecting to the PostgreSQL engine, select the PostgreSQL driver. In order to connect to the PostgreSQL system, you will need to extract the admin password using the following command when connected to the watsonx.data system. cat /certs/passwords You can also retrieve the credentials by opening up the Credentials notebook in the Jupyter notebook service. Alternatively, you can extract the password directly from the PostgreSQL container and save it for later use. export POSTGRES_PASSWORD=$(docker exec ibm-lh-postgres printenv | grep POSTGRES_PASSWORD | sed 's/.*=//') echo \"Postgres Userid : admin\" echo \"Postgres Password : \" $POSTGRES_PASSWORD echo $POSTGRES_PASSWORD > /tmp/postgres.pw PostgreSQL Internal Access When accessing the PostgreSQL database in the system, use the following settings. Hostname \u2013 ibm-lh-postgres Port \u2013 5432 Username \u2013 admin Password \u2013 The value that was extracted in the earlier step Database \u2013 gosales PostgreSQL External Access The following credentials are used for remote access. Hostname: region.techzone-server.com Port: port Username: admin Password: The value that was extracted in the earlier step Database name: gosales MySQL Access When connecting to the MySQL engine, select the MySQL driver. MySQL Internal Access When accessing the MySQL database in the system, use the following settings. Hostname \u2013 watsonxdata Port \u2013 3306 Username \u2013 root Password - password Database \u2013 gosalesdw You must set the allowPublicKeyRetrieval to True for the connection to work with dBeaver. MySQL External Access The following credentials are used for remote access. Hostname: region.techzone-server.com Port: port Username: root Password - password Database name: gosalesdw You must set the allowPublicKeyRetrieval to True for the connection to work with dBeaver (see above). Adding a Database to watsonx.data When adding a database engine to the watsonx.data system, make sure to change the database display name since that needs to be unique. For instance, when you add the gosales database from Db2 to the system, the display name could be gosales as well. However, if you now add the PostgreSQL database to the system, the display name cannot be the same. You may want to differentiate databases with the same name by prefixing them with the database type. For instance, the gosales database could be shown as db2_gosales or pg_gosales so that you keep the names distinct. Once a database has been added, make sure to wait for a few moments before attempting to access the database. The Presto server takes a few moments to start up. To make sure that it is running, run the check_presto command in a terminal window and wait until it says the service is ready. When attempting to view the contents of a new database, the process may take a few minutes to complete. Refresh the browser window if you don't see any changes to the display.
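A quick way to confirm that a newly added database has been registered is to list the catalogs from the Presto CLI; the display name you chose should appear in the output. This is a sketch using the same presto-cli invocation as the earlier examples. ./presto-cli --catalog iceberg_data --schema workshop show catalogs; quit;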
Accessing watsonx.data via Python In order to access the watsonx.data database (Presto), you will need to install the Presto client using the following command on your local machine. pip3 install presto-python-client Once the installation is complete, extract the certificate from the watsonx.data server that we will use in the connection. scp -P port watsonx@region.techzone-server.com:/certs/presto-key.jks /Users/myname/Downloads Change the target directory to a location that you can remember! You can also download the certificate by using the Jupyter Notebook link and opening the Certificate notebook. Python and Jupyter Connection Code Your Python or Jupyter notebook code will need to import the prestodb library and then connect to watsonx.data using the connect call. import prestodb conn = prestodb.dbapi.connect( host='watsonxdata', port=8443, user='ibmlhadmin', catalog='tpch', schema='tiny', http_scheme='https', auth=prestodb.auth.BasicAuthentication(\"ibmlhadmin\", \"password\") ) conn._http_session.verify = '/certs/lh-ssl-ts.crt' cur = conn.cursor() In the above connection string, you will need to replace the following values: host - watsonxdata when connecting to the image externally, and ibm-lh-presto-svc when connecting internally catalog - What is the name of the catalog that we are accessing schema - The schema inside the catalog that will be used You also need to update the conn._http_session.verify value with the location where you downloaded the lh-ssl-ts.crt file. For internal connections, this value will be /certs/lh-ssl-ts.crt . Once connected, you can run an SQL statement and return the results. cur.execute(\"SELECT * FROM tpch.tiny.customer\") rows = cur.fetchall() The rows variable contains the answer set from the select statement. You can manipulate the row variable to view the results. rows[0] [1, 'Customer#000000001', 'IVhzIApeRb ot,c,E', 15, '25-989-741-2988', 711.56, 'BUILDING', 'to the even, regular platelets. regular, ironic epitaphs nag e'] The PrestoDB driver supports the DBAPI spec. For more details on the use of the DBAPI interface, please refer to https://peps.python.org/pep-0249/ . For instance, if you want to find the description of the columns returned, you would use the description function. cur.description [('custkey', 'bigint', None, None, None, None, None), ('name', 'varchar(25)', None, None, None, None, None), ('address', 'varchar(40)', None, None, None, None, None), ('nationkey', 'bigint', None, None, None, None, None), ('phone', 'varchar(15)', None, None, None, None, None), ('acctbal', 'double', None, None, None, None, None), ('mktsegment', 'varchar(10)', None, None, None, None, None), ('comment', 'varchar(117)', None, None, None, None, None)] Accessing watsonx.data via Pandas Dataframes The following code is required for accessing watsonx.data in Jupyter notebooks. Run the following code inside a notebook code cell. %pip install ipython-sql==0.4.1 %pip install sqlalchemy==1.4.46 %pip install sqlalchemy==1.4.46 \"pyhive[presto]\" The notebook may need a restart of the kernel to pick up the changes to the driver. If you are running in a Jupyter Lab environment, you can use the most current versions of the drivers. %pip install ipython-sql %pip install sqlalchemy %pip install sqlalchemy \"pyhive[presto]\" Once the drivers have been loaded, you will need to extract the certificate from the watsonx.data server that we will use in the connection. 
scp -P port watsonx@region.techzone-server.com:/certs/presto-key.jks /Users/myname/Downloads Change the target directory to a location that you can remember! You can also download the certificate by using the Jupyter Notebook link and opening the Certificate notebook. In your Jupyter notebook, you will need to import a number of libraries. import pandas as pd import sqlalchemy from sqlalchemy import create_engine Create a notebook cell which will contain all the credentials that are required to connect. Change the catalog , schema and certfile to your values. userid = \"ibmlhadmin\" password = \"password\" hostname = \"watsonxdata\" port = \"8443\" catalog = \"tpch\" schema = \"tiny\" certfile = \"/certs/lh-ssl-ts.crt\" connect_args={ 'protocol': 'https', 'requests_kwargs': {'verify': f'{certfile}'} } In the above settings, you will need to replace the following values: hostname - region.techzone-server.com when connecting to the image externally, and ibm-lh-presto-svc when connecting internally catalog - What is the name of the catalog that we are accessing schema - The schema inside the catalog that will be used You also need to update the certfile value with the location where you downloaded the lh-ssl-ts.crt file. For internal connections, this value will be /certs/lh-ssl-ts.crt . To create a connection to the database, use the following syntax. engine = create_engine( f\"presto://{userid}:{password}@{hostname}:{port}/{catalog}/{schema}\", connect_args=connect_args ) Now that you have established a connection, you can use the Pandas read_sql_query function to execute a SELECT statement against the database. mypresto = pd.read_sql_query('SELECT * from tpch.tiny.customer',engine) The variable mypresto contains the dataframe generated from the SELECT statement. mypresto You can use the features of Pandas to generate plots of the data in your notebook. First make sure you have matplotlib installed. %pip install matplotlib The following query will compute the total account balance across all nation key values. sumbynation = pd.read_sql_query('SELECT \"nationkey\", sum(\"acctbal\") AS \"totalbal\" from tpch.tiny.customer group by \"nationkey\" order by 2',engine) Finally, we plot the results. import matplotlib.pyplot as plt sumbynation.plot(kind=\"bar\", x=\"nationkey\", y=\"totalbal\") plt.show() Adding a Service The watsonx.data developer edition includes two open ports which can be used to externalize a service that you create in the image. For instance, you may choose to create a MongoDB or MSSQL container using Docker and want to access this service from your own dBeaver or Mongo tooling. Since port numbers vary between different databases, the watsonx.data system provides two port numbers that can be used by your service. Open Port 1 - Server: region.techzone-services.com: Port: 12345 Open Port 2 - Server: region.techzone-services.com: Port: 23456 The internal port numbers are 10000 (Port 1) and 10001 (Port 2). The following steps are required to use these ports with your service. Open the local Firewall (Version 1.1.0 Only) Ports 10000/1 are not open by default in the 1.1.0 image. For release 1.1.1, you can skip this step. You must explicitly open ports 10000/1 with the firewall-cmd command. In a command line shell, as the root user, enter the following commands: sudo su - firewall-cmd --add-port={10000/tcp,10001/tcp} --zone=public --permanent firewall-cmd --reload You can use the following command to check that the ports are now open.
firewall-cmd --list-ports Create your Service When creating your service, make sure to map the internal Docker port to either port 10000 or 10001. If you cannot remap the port, see the section on port redirection. For instance, the following command will start Microsoft SQLServer in Docker by mapping the SQLServer port of 1433 to the host port 10000 ( -p 10000:1433 ). docker run -e \"ACCEPT_EULA=Y\" -e \"MSSQL_SA_PASSWORD=Passw0rd12345678!\" \\ -p 10000:1433 --name mssql-server --hostname mssql-server \\ -d mcr.microsoft.com/mssql/server:2019-latest You can check the port mapping with the following command. docker container ls --format \"table {{.ID}}\\t{{.Names}}\\t{{.Ports}}\" -a | grep mssql-server When creating a connection to this database using an external tool, make sure to use the port number supplied in the reservation details (Open Port 1 is for port 10000 and Open Port 2 is for port 10001). Port Redirection If you already have an existing service mapped to a different port, you can use port redirection to use either port 10000 or 10001. For instance, assume that the previous creation of the SQLServer database used port 1433. docker run -e \"ACCEPT_EULA=Y\" -e \"MSSQL_SA_PASSWORD=Passw0rd12345678!\" \\ -p 1433:1433 --name mssql-server --hostname mssql-server \\ -d mcr.microsoft.com/mssql/server:2019-latest Once the service is up and running, you can redirect the traffic on port 10000/1 to the port of your docker service. firewall-cmd --add-forward-port=port=10000:proto=tcp:toport=1433 --permanent --zone=public firewall-cmd --reload If you need to remove the redirection, use the following command. firewall-cmd --remove-forward-port=port=10000:proto=tcp:toport=1433:toaddr= --permanent --zone=public firewall-cmd --reload Accessing your Service When referring to your service from an external location, always use the port numbers that are provided for Open Port 1 or 2. Open Port 1 - Server: region.techzone-services.com: Port: 12345 Open Port 2 - Server: region.techzone-services.com: Port: 23456 Your server will be region.techzone-services.com and the port number will be either of the two port numbers provided. Remember that this port number will need to be opened in the server and a Docker mapping to the open port or a firewall port redirection will be required.","title":"Database Connections"},{"location":"wxd-connections/#database-connections","text":"There are four database systems that can be accessed inside and outside the virtual machine environment: watsonx.data Presto, Db2 LUW, MySQL and PostgreSQL. In order to access these services from outside the virtual machine image, you need the server name and port for the service. You will also need to download the presto-key.jks file for connecting to Presto. Connection Certificate Accessing watsonx.data (Presto) Accessing Db2 Accessing PostgreSQL Accessing MySQL Adding a database to watsonx.data Accessing watsonx.data via Python Accessing watsonx.data via Pandas Dataframes Generating a Certificate Adding a Service","title":"Database Connections"},{"location":"wxd-connections/#watsonxdata-connection-certificate","text":"When connecting to the watsonx.data Presto database, you will need to have the connection certificate available to the client that you are using. Usually this location is your workstation, but it could be another service like CP4D. To extract the certificate to your local file system, use the following command in a terminal window.
Replace the port and region.techzone-server.com with the SSH values found in the TechZone reservation. scp -P port watsonx@region.techzone-server.com:/certs/presto-key.jks /Users/myname/Downloads Change the target directory to a location that you can remember! You can also download the certificate by using the Jupyter Notebook link and opening the Credentials notebook. There you will find links to the certificates.","title":"Watsonx.data Connection Certificate"},{"location":"wxd-connections/#watsonxdata-presto-access","text":"When connecting to the Presto engine, choose the PrestoDB driver.","title":"Watsonx.data Presto Access"},{"location":"wxd-connections/#presto-internal-access","text":"For local access the following credentials are used: Hostname: localhost Port: 8443 Username: ibmlhadmin Password: password Database: tpch In addition, you need to set the following driver properties: SSL True SSLTrustStorePath /certs/presto-key.jks SSLTrustStorePassword watsonx.data","title":"Presto Internal Access"},{"location":"wxd-connections/#presto-external-access","text":"The watsonx.data Presto database requires that the certificate be extracted from the image. See the section above on Connection Certificate for more details. In the following settings, remember to update the Hostname and Port to the values provided in your TechZone reservation. The database connection settings are: Hostname: region.techzone-server.com Port: port Username: ibmlhadmin Password: password Database: tpch In addition, you need to set the following driver properties: SSL True SSLTrustStorePath /mydownload/presto-key.jks SSLTrustStorePassword watsonx.data Note : The /mydownload/presto-key.jks value needs to be replaced with the location where you copied the key in the earlier step.","title":"Presto External Access"},{"location":"wxd-connections/#db2-access","text":"When connecting to the Db2 engine, select the Db2 LUW driver.","title":"Db2 Access"},{"location":"wxd-connections/#db2-internal-access","text":"The Db2 server can be accessed on port 50000 inside the virtual machine using the following credentials: Hostname - watsonxdata Port - 50000 Username - db2inst1 Password - db2inst1 Database - gosales SSL - off","title":"Db2 Internal Access"},{"location":"wxd-connections/#db2-external-access","text":"When accessing the database outside the virtual machine, you must change the host to region.techzone-server.com and the port number based on your TechZone reservation. All the other settings remain the same. Hostname - region.techzone-server.com Port - port Username - db2inst1 Password - db2inst1 Database - gosales SSL - off","title":"Db2 External Access"},{"location":"wxd-connections/#postgresql-access","text":"When connecting to the PostgreSQL engine, select the PostgreSQL driver. In order to connect to the PostgreSQL system, you will need to extract the admin password using the following command when connected to the watsonx.data system. cat /certs/passwords You can also retrieve the credentials by opening up the Credentials notebook in the Jupyter notebook service. Alternatively, you can extract the password directly from the PostgreSQL container and save it for later use. export POSTGRES_PASSWORD=$(docker exec ibm-lh-postgres printenv | grep POSTGRES_PASSWORD | sed 's/.*=//') echo \"Postgres Userid : admin\" echo \"Postgres Password : \" $POSTGRES_PASSWORD echo $POSTGRES_PASSWORD > /tmp/postgres.pw","title":"PostgreSQL Access"},{"location":"wxd-connections/#postgresql-internal-access","text":"When accessing the PostgreSQL database in the system, use the following settings.
Hostname \u2013 ibm-lh-postgres Port \u2013 5432 Username \u2013 admin Password \u2013 The value that was extracted in the earlier step Database \u2013 gosales","title":"PostgreSQL Internal Access"},{"location":"wxd-connections/#postgresql-external-access","text":"The following credentials are used for remote access. Hostname: region.techzone-server.com Port: port Username: admin Password: The value that was extracted in the earlier step Database name: gosales","title":"PostgreSQL External Access"},{"location":"wxd-connections/#mysql-access","text":"When connecting to the MySQL engine, select the MySQL driver.","title":"MySQL Access"},{"location":"wxd-connections/#mysql-internal-access","text":"When accessing the MySQL database in the system, use the following settings. Hostname \u2013 watsonxdata Port \u2013 3306 Username \u2013 root Password - password Database \u2013 gosalesdw You must set the allowPublicKeyRetrieval to True for the connection to work with dBeaver.","title":"MySQL Internal Access"},{"location":"wxd-connections/#mysql-external-access","text":"The following credentials are used for remote access. Hostname: region.techzone-server.com Port: port Username: root Password - password Database name: gosalesdw You must set the allowPublicKeyRetrieval to True for the connection to work with dBeaver (see above).","title":"MySQL External Access"},{"location":"wxd-connections/#adding-a-database-to-watsonxdata","text":"When adding a database engine to the watsonx.data system, make sure to change the database display name since that needs to be unique. For instance, when you add the gosales database from Db2 to the system, the display name could be gosales as well. However, if you now add the PostgreSQL database to the system, the display name cannot be the same. You may want to differentiate databases with the same name by prefixing them with the database type. For instance, the gosales database could be shown as db2_gosales or pg_gosales so that you keep the names distinct. Once a database has been added, make sure to wait for a few moments before attempting to access the database. The Presto server takes a few moments to start up. To make sure that it is running, run the check_presto command in a terminal window and wait until it says the service is ready. When attempting to view the contents of a new database, the process may take a few minutes to complete. Refresh the browser window if you don't see any changes to the display.","title":"Adding a Database to watsonx.data"},{"location":"wxd-connections/#accessing-watsonxdata-via-python","text":"In order to access the watsonx.data database (Presto), you will need to install the Presto client using the following command on your local machine. pip3 install presto-python-client Once the installation is complete, extract the certificate from the watsonx.data server that we will use in the connection. scp -P port watsonx@region.techzone-server.com:/certs/presto-key.jks /Users/myname/Downloads Change the target directory to a location that you can remember!
You can also download the certificate by using the Jupyter Notebook link and opening the Certificate notebook.","title":"Accessing watsonx.data via Python"},{"location":"wxd-connections/#python-and-jupyter-connection-code","text":"Your Python or Jupyter notebook code will need to import the prestodb library and then connect to watsonx.data using the connect call. import prestodb conn = prestodb.dbapi.connect( host='watsonxdata', port=8443, user='ibmlhadmin', catalog='tpch', schema='tiny', http_scheme='https', auth=prestodb.auth.BasicAuthentication(\"ibmlhadmin\", \"password\") ) conn._http_session.verify = '/certs/lh-ssl-ts.crt' cur = conn.cursor() In the above connection string, you will need to replace the following values: host - watsonxdata when connecting to the image externally, and ibm-lh-presto-svc when connecting internally catalog - What is the name of the catalog that we are accessing schema - The schema inside the catalog that will be used You also need to update the conn._http_session.verify value with the location where you downloaded the lh-ssl-ts.crt file. For internal connections, this value will be /certs/lh-ssl-ts.crt . Once connected, you can run an SQL statement and return the results. cur.execute(\"SELECT * FROM tpch.tiny.customer\") rows = cur.fetchall() The rows variable contains the answer set from the select statement. You can manipulate the row variable to view the results. rows[0] [1, 'Customer#000000001', 'IVhzIApeRb ot,c,E', 15, '25-989-741-2988', 711.56, 'BUILDING', 'to the even, regular platelets. regular, ironic epitaphs nag e'] The PrestoDB driver supports the DBAPI spec. For more details on the use of the DBAPI interface, please refer to https://peps.python.org/pep-0249/ . For instance, if you want to find the description of the columns returned, you would use the description function. cur.description [('custkey', 'bigint', None, None, None, None, None), ('name', 'varchar(25)', None, None, None, None, None), ('address', 'varchar(40)', None, None, None, None, None), ('nationkey', 'bigint', None, None, None, None, None), ('phone', 'varchar(15)', None, None, None, None, None), ('acctbal', 'double', None, None, None, None, None), ('mktsegment', 'varchar(10)', None, None, None, None, None), ('comment', 'varchar(117)', None, None, None, None, None)]","title":"Python and Jupyter Connection Code"},{"location":"wxd-connections/#accessing-watsonxdata-via-pandas-dataframes","text":"The following code is required for accessing watsonx.data in Jupyter notebooks. Run the following code inside a notebook code cell. %pip install ipython-sql==0.4.1 %pip install sqlalchemy==1.4.46 %pip install sqlalchemy==1.4.46 \"pyhive[presto]\" The notebook may need a restart of the kernel to pick up the changes to the driver. If you are running in a Jupyter Lab environment, you can use the most current versions of the drivers. %pip install ipython-sql %pip install sqlalchemy %pip install sqlalchemy \"pyhive[presto]\" Once the drivers have been loaded, you will need to extract the certificate from the watsonx.data server that we will use in the connection. scp -P port watsonx@region.techzone-server.com:/certs/presto-key.jks /Users/myname/Downloads Change the target directory to a location that you can remember! You can also download the certificate by using the Jupyter Notebook link and opening the Certificate notebook. In your Jupyter notebook, you will need to import a number of libraries. 
import pandas as pd import sqlalchemy from sqlalchemy import create_engine Create a notebook cell which will contain all the credentials that are required to connect. Change the catalog , schema and certfile to your values. userid = \"ibmlhadmin\" password = \"password\" hostname = \"watsonxdata\" port = \"8443\" catalog = \"tpch\" schema = \"tiny\" certfile = \"/certs/lh-ssl-ts.crt\" connect_args={ 'protocol': 'https', 'requests_kwargs': {'verify': f'{certfile}'} } In the above settings, you will need to replace the following values: hostname - region.techzone-server.com when connecting to the image externally, and ibm-lh-presto-svc when connecting internally catalog - What is the name of the catalog that we are accessing schema - The schema inside the catalog that will be used You also need to update the certfile value with the location where you downloaded the lh-ssl-ts.crt file. For internal connections, this value will be /certs/lh-ssl-ts.crt . To create a connection to the database, use the following syntax. engine = create_engine( f\"presto://{userid}:{password}@{hostname}:{port}/{catalog}/{schema}\", connect_args=connect_args ) Now that you have established a connection, you can use the Pandas read_sql_query function to execute a SELECT statement against the database. mypresto = pd.read_sql_query('SELECT * from tpch.tiny.customer',engine) The variable mypresto contains the dataframe generated from the SELECT statement. mypresto You can use the features of Pandas to generate plots of the data in your notebook. First make sure you have matplotlib installed. %pip install matplotlib The following query will compute the total account balance across all nation key values. sumbynation = pd.read_sql_query('SELECT \"nationkey\", sum(\"acctbal\") AS \"totalbal\" from tpch.tiny.customer group by \"nationkey\" order by 2',engine) Finally, we plot the results. import matplotlib.pyplot as plt sumbynation.plot(kind=\"bar\", x=\"nationkey\", y=\"totalbal\") plt.show()","title":"Accessing watsonx.data via Pandas Dataframes"},{"location":"wxd-connections/#adding-a-service","text":"The watsonx.data developer edition includes two open ports which can be used to externalize a service that you create in the image. For instance, you may choose to create a MongoDB or MSSQL container using Docker and want to access this service from your own dBeaver or Mongo tooling. Since port numbers vary between different databases, the watsonx.data system provides two port numbers that can be used by your service. Open Port 1 - Server: region.techzone-services.com: Port: 12345 Open Port 2 - Server: region.techzone-services.com: Port: 23456 The internal port numbers are 10000 (Port 1) and 10001 (Port 2). The following steps are required to use these ports with your service.","title":"Adding a Service"},{"location":"wxd-connections/#open-the-local-firewall-version-110-only","text":"Ports 10000/1 are not open by default in the 1.1.0 image. For release 1.1.1, you can skip this step. You must explicitly open ports 10000/1 with the firewall-cmd command. In a command line shell, as the root user, enter the following commands: sudo su - firewall-cmd --add-port={10000/tcp,10001/tcp} --zone=public --permanent firewall-cmd --reload You can use the following command to check that the ports are now open. firewall-cmd --list-ports","title":"Open the local Firewall (Version 1.1.0 Only)"},{"location":"wxd-connections/#create-your-service","text":"When creating your service, make sure to map the internal Docker port to either port 10000 or 10001.
If you cannot remap the port, see the section on port redirection. For instance, the following command will start Microsoft SQLServer in Docker by mapping the SQLServer port of 1433 to the host port 10000 ( -p 10000:1433 ). docker run -e \"ACCEPT_EULA=Y\" -e \"MSSQL_SA_PASSWORD=Passw0rd12345678!\" \\ -p 10000:1433 --name mssql-server --hostname mssql-server \\ -d mcr.microsoft.com/mssql/server:2019-latest You can check the port mapping with the following command. docker container ls --format \"table {{.ID}}\\t{{.Names}}\\t{{.Ports}}\" -a | grep mssql-server When creating a connection to this database using an external tool, make sure to use the port number supplied in the reservation details (Open Port 1 is for port 10000 and Open Port 2 is for port 10001).","title":"Create your Service"},{"location":"wxd-connections/#port-redirection","text":"If you already have an existing service mapped to a different port, you can use port redirection to use either port 10000 or 10001. For instance, assume that the previous creation of the SQLServer database used port 1433. docker run -e \"ACCEPT_EULA=Y\" -e \"MSSQL_SA_PASSWORD=Passw0rd12345678!\" \\ -p 1433:1433 --name mssql-server --hostname mssql-server \\ -d mcr.microsoft.com/mssql/server:2019-latest Once the service is up and running, you can redirect the traffic on port 10000/1 to the port of your docker service. firewall-cmd --add-forward-port=port=10000:proto=tcp:toport=1433 --permanent --zone=public firewall-cmd --reload If you need to remove the redirection, use the following command. firewall-cmd --remove-forward-port=port=10000:proto=tcp:toport=1433:toaddr= --permanent --zone=public firewall-cmd --reload","title":"Port Redirection"},{"location":"wxd-connections/#accessing-your-service","text":"When referring to your service from an external location, always use the port numbers that are provided for Open Port 1 or 2. Open Port 1 - Server: region.techzone-services.com: Port: 12345 Open Port 2 - Server: region.techzone-services.com: Port: 23456 Your server will be region.techzone-services.com and the port number will be either of the two port numbers provided. Remember that this port number will need to be opened in the server and a Docker mapping to the open port or a firewall port redirection will be required.","title":"Accessing your Service"},{"location":"wxd-datasets-gosales/","text":"Great Outdoors Company The Sample Outdoors Company, or GO Sales, or any variation of the Sample Outdoors name, is the name of a fictitious business operation whose sample data is used to develop sample applications for IBM\u00ae and IBM customers. Its fictitious records include sample data for sales transactions, product distribution, finance, and human resources. Any resemblance to actual names, addresses, contact numbers, or transaction values, is coincidental. Two links that provide more details on the database. Great Outdoors Company Great Outdoors Database Reference The second link will say that there is no content available, but if you click on the down arrow you will see the table names. Disclaimer The Sample Outdoors Company, or GO Sales, or any variation of the Sample Outdoors name, is the name of a fictitious business operation whose sample data is used to develop sample applications for IBM\u00ae and IBM customers. Its fictitious records include sample data for sales transactions, product distribution, finance, and human resources.
Unauthorized duplication is prohibited. Table Definitions These tables are created under the GOSALESDW schema. Thanks to Michael Schapira for generating the following ER diagram. You may need to download the image to zoom in on the relationships. DIST_INVENTORY_FACT Column Type MONTH_KEY INTEGER ORGANIZATION_KEY INTEGER BRANCH_KEY INTEGER PRODUCT_KEY INTEGER OPENING_INVENTORY INTEGER QUANTITY_SHIPPED INTEGER ADDITIONS INTEGER UNIT_COST DECIMAL CLOSING_INVENTORY INTEGER AVERAGE_UNIT_COST DECIMAL DIST_PRODUCT_FORECASE_FACT Column Type MONTH_KEY INTEGER ORGANIZATION_KEY INTEGER BASE_PRODUCT_KEY INTEGER BRANCH_KEY INTEGER UNIT_COST DECIMAL UNIT_PRICE DECIMAL EXPECTED_VOLUME INTEGER DIST_RETURNED_ITEMS_FACT Column Type DAY_KEY INTEGER ORGANIZATION_KEY INTEGER BRANCH_KEY INTEGER EMPLOYEE_KEY INTEGER RETAILER_SITE_KEY INTEGER PRODUCT_KEY INTEGER ORDER_METHOD_KEY INTEGER SALES_ORDER_KEY INTEGER RETURN_REASON_KEY INTEGER RETURN_QUANTITY INTEGER DIST_RETURN_REASON_DIM Column Type RETURN_REASON_KEY INTEGER RETURN_REASON_CODE INTEGER REASON_DESCRIPTION_EN VARCHAR EMP_EMPLOYEE_DIM Column Type EMPLOYEE_KEY INTEGER MANAGER_CODE1 INTEGER MANAGER1 VARCHAR MANAGER_MB1 VARCHAR MANAGER_CODE2 INTEGER MANAGER2 VARCHAR MANAGER_MB2 VARCHAR MANAGER_CODE3 INTEGER MANAGER3 VARCHAR MANAGER_MB3 VARCHAR MANAGER_CODE4 INTEGER MANAGER4 VARCHAR MANAGER_MB4 VARCHAR MANAGER_CODE5 INTEGER MANAGER5 VARCHAR MANAGER_MB5 VARCHAR MANAGER_CODE6 INTEGER MANAGER6 VARCHAR MANAGER_MB6 VARCHAR EMPLOYEE_CODE INTEGER EMPLOYEE_NAME VARCHAR FIRST_NAME VARCHAR LAST_NAME VARCHAR EMPLOYEE_NAME_MB VARCHAR FIRST_NAME_MB VARCHAR LAST_NAME_MB VARCHAR MANAGER_CODE INTEGER ORGANIZATION_CODE VARCHAR ADDRESS1 VARCHAR ADDRESS2 VARCHAR ADDRESS1_MB VARCHAR ADDRESS2_MB VARCHAR CITY VARCHAR CITY_MB VARCHAR PROV_STATE VARCHAR PROV_STATE_MB VARCHAR POSTAL_ZONE VARCHAR BRANCH_CODE INTEGER BIRTH_DATE DATE GENDER_CODE SMALLINT WORK_PHONE VARCHAR EXTENSION VARCHAR FAX VARCHAR EMAIL VARCHAR DATE_HIRED DATE TERMINATION_CODE INTEGER TERMINATION_DATE DATE POSITION_START_DATE DATE POSITION_CODE INTEGER EMPLOYEE_LEVEL SMALLINT ACTIVE_INDICATOR SMALLINT RECORD_START_DATE DATE RECORD_END_DATE DATE MANAGER_KEY INTEGER EMP_EXPENSE_FACT Column Type DAY_KEY INTEGER ORGANIZATION_KEY INTEGER POSITION_KEY INTEGER EMPLOYEE_KEY INTEGER EXPENSE_TYPE_KEY INTEGER ACCOUNT_KEY INTEGER EXPENSE_UNIT_QUANTITY FLOAT EXPENSE_TOTAL DECIMAL EMP_EXPENSE_PLAN_FACT Column Type MONTH_KEY INTEGER ORGANIZATION_KEY INTEGER EXPENSE_TYPE_KEY INTEGER ACCOUNT_KEY VARCHAR EXPENSE_PLAN_TOTAL DECIMAL EMP_EXPENSE_TYPE_DIM Column Type EXPENSE_TYPE_KEY INTEGER EXPENSE_GROUP_CODE INTEGER EXPENSE_TYPE_CODE INTEGER EXPENSE_UNIT_CODE INTEGER EXPENSE_GROUP_EN VARCHAR EXPENSE_TYPE_EN VARCHAR EMP_EXPENSE_UNIT_LOOKUP Column Type EXPENSE_UNIT_CODE INTEGER EXPENSE_UNIT_EN VARCHAR EMP_POSITION_DIM Column Type POSITION_KEY INTEGER POSITION_CODE1 INTEGER POSITION_CODE2 INTEGER POSITION_CODE3 INTEGER POSITION_CODE INTEGER POSITION_PARENT INTEGER MIN_SALARY DECIMAL MAX_SALARY DECIMAL PAID_HOURLY INTEGER POSITION_LEVEL SMALLINT EMP_POSITION_LOOKUP Column Type POSITION_CODE INTEGER POSITION_EN VARCHAR EMP_POSITION_SUMMARY_FACT Column Type DAY_KEY INTEGER ORGANIZATION_KEY INTEGER POSITION_KEY INTEGER POSITION_COUNT INTEGER PLANNED_POSITION_COUNT INTEGER INTERNAL_HIRES INTEGER EXTERNAL_HIRES INTEGER TERMINATIONS INTEGER EMP_RANKING_DIM Column Type EMPLOYEE_RANKING_KEY INTEGER RANKING_CODE INTEGER RANKING_DESCRIPTION_EN VARCHAR EMP_RANKING_FACT Column Type ORGANIZATION_KEY INTEGER POSITION_KEY INTEGER EMPLOYEE_KEY INTEGER 
DAY_KEY INTEGER EMPLOYEE_RANKING_KEY INTEGER RANKING_CODE INTEGER EMP_RECRUITMENT_DIM Column Type RECRUITMENT_MEDIUM_KEY INTEGER RECRUITMENT_MEDIUM_CODE INTEGER RECRUITMENT_TYPE_CODE INTEGER RECRUITMENT_MEDIUM_NAME_EN VARCHAR RECRUITMENT_TYPE_EN VARCHAR EMP_RECRUITMENT_FACT Column Type POST_DAY_KEY INTEGER RECRUITMENT_MEDIUM_KEY INTEGER ORGANIZATION_KEY INTEGER BRANCH_KEY INTEGER POSITION_KEY INTEGER POSITION_POSTING_DATE DATE POSITION_FILLED_DATE DATE POSITION_START_DATE DATE DAYS_TO_FILL INTEGER EMP_SUCCESSION_FACT Column Type DAY_KEY INTEGER ORGANIZATION_KEY INTEGER POSITION_KEY INTEGER EMPLOYEE_KEY INTEGER SUCCESSOR_EMPLOYEE_KEY INTEGER SUCCESSOR_POSITION_KEY INTEGER SUCCESSOR_STATUS_KEY INTEGER PERCENT_READY FLOAT TARGET_PERCENT_READY FLOAT EMP_SUCCESSION_STATUS_DIM Column Type SUCCESSOR_STATUS_KEY INTEGER SUCCESSOR_STATUS_CODE INTEGER SUCCESSOR_STATUS_EN VARCHAR EMP_SUMMARY_FACT Column Type DAY_KEY INTEGER ORGANIZATION_KEY INTEGER POSITION_KEY INTEGER EMPLOYEE_KEY INTEGER SALARY DECIMAL PAY_INCREASE FLOAT BONUS FLOAT VACATION_DAYS_TAKEN FLOAT SICK_DAYS_TAKEN FLOAT EMP_SURVEY_FACT Column Type DAY_KEY INTEGER ORGANIZATION_KEY INTEGER POSITION_KEY INTEGER EMPLOYEE_TOPIC_KEY INTEGER EMPLOYEE_TOPIC_SCORE FLOAT SATISFACTION_KEY INTEGER EMP_SURVEY_TARG_FACT Column Type MONTH_KEY INTEGER EMPLOYEE_TOPIC_KEY INTEGER EMPLOYEE_TOPIC_TARGET FLOAT EMPLOYEE_SURVEY_BENCHMARK FLOAT EMP_SURVEY_TOPIC_DIM Column Type EMPLOYEE_TOPIC_KEY INTEGER EMPLOYEE_TOPIC_CODE INTEGER EMPLOYEE_TOPIC_EN VARCHAR EMP_TERMINATION_LOOKUP Column Type TERMINATION_CODE INTEGER TERMINATION_REASON_EN VARCHAR EMP_TRAINING_DIM Column Type TRAINING_KEY INTEGER COURSE_CODE INTEGER COURSE_COST DECIMAL COURSE_DAYS FLOAT COURSE_NAME_EN VARCHAR EMP_TRAINING_FACT Column Type ORGANIZATION_KEY INTEGER POSITION_KEY INTEGER EMPLOYEE_KEY INTEGER DAY_KEY INTEGER EXPENSE_TYPE_KEY INTEGER TRAINING_KEY INTEGER COURSE_COST DECIMAL COURSE_DAYS FLOAT FIN_ACCOUNT_CLASS_LOOKUP Column Type ACCOUNT_CLASS_CODE INTEGER ACCOUNT_CLASS_EN VARCHAR FIN_ACCOUNT_DIM Column Type ACCOUNT_KEY INTEGER ACCOUNT_CODE1 VARCHAR ACCOUNT_CODE2 VARCHAR ACCOUNT_CODE3 VARCHAR ACCOUNT_CODE4 VARCHAR ACCOUNT_CODE5 VARCHAR ACCOUNT_CODE6 VARCHAR ACCOUNT_CODE7 VARCHAR ACCOUNT_CODE8 VARCHAR ACCOUNT_CODE9 VARCHAR ACCOUNT_CODE10 VARCHAR ACCOUNT_CODE11 VARCHAR ACCOUNT_CODE12 VARCHAR ACCOUNT_CODE13 VARCHAR ACCOUNT_CODE14 VARCHAR ACCOUNT_CODE15 VARCHAR ACCOUNT_CODE16 VARCHAR ACCOUNT_CODE VARCHAR ACCOUNT_PARENT VARCHAR DEBIT_OR_CREDIT CHAR(3) ACCOUNT_TYPE_CODE CHAR(3) ACCOUNT_CLASS_CODE INTEGER ACCOUNT_LEVEL INTEGER AGGREGATION_SIGN CHAR(3) FIN_ACCOUNT_NAME_LOOKUP Column Type ACCOUNT_CODE VARCHAR ACCOUNT_NAME_EN VARCHAR FIN_ACCOUNT_TYPE_LOOKUP Column Type ACCOUNT_TYPE_CODE CHAR(3) ACCOUNT_TYPE_EN VARCHAR FIN_FINANCE_FACT Column Type SUBMISSION_KEY INTEGER ORGANIZATION_KEY INTEGER ACCOUNT_KEY INTEGER MONTH_KEY INTEGER AMOUNT_YEAR_TO_DATE DECIMAL AMOUNT_MONTH DECIMAL FIN_SUBM_CURRENCY_LOOKUP Column Type SUBMISSION_CURRENCY_CODE VARCHAR SUBMISSION_CURRENCY_EN VARCHAR FIN_SUBM_DIM Column Type SUBMISSION_KEY INTEGER SUBMISSION_CODE VARCHAR SUBMISSION_NAME_EN VARCHAR SUBMISSION_YEAR INTEGER SUBMISSION_TYPE_CODE CHAR(3) SUBMISSION_CURRENCY_CODE VARCHAR FIN_SUBM_TYPE_LOOKUP Column Type SUBMISSION_TYPE_CODE CHAR(3) SUBMISSION_TYPE_EN VARCHAR GO_BRANCH_DIM Column Type BRANCH_KEY INTEGER BRANCH_CODE INTEGER ADDRESS1 VARCHAR ADDRESS2 VARCHAR CITY VARCHAR PROV_STATE VARCHAR ADDRESS1_MB VARCHAR ADDRESS2_MB VARCHAR CITY_MB VARCHAR PROV_STATE_MB VARCHAR POSTAL_ZONE VARCHAR COUNTRY_CODE INTEGER
WAREHOUSE_BRANCH_CODE INTEGER GO_GENDER_LOOKUP Column Type GENDER_CODE SMALLINT GENDER_EN VARCHAR GO_ORG_DIM Column Type ORGANIZATION_KEY INTEGER ORGANIZATION_CODE1 VARCHAR ORGANIZATION_CODE2 VARCHAR ORGANIZATION_CODE3 VARCHAR ORGANIZATION_CODE4 VARCHAR ORGANIZATION_CODE VARCHAR ORGANIZATION_PARENT VARCHAR ORGANIZATION_LEVEL SMALLINT GO_ORG_NAME_LOOKUP Column Type ORGANIZATION_CODE VARCHAR ORGANIZATION_NAME_EN VARCHAR GO_REGION_DIM Column Type COUNTRY_KEY INTEGER COUNTRY_CODE INTEGER FLAG_IMAGE VARCHAR ISO_THREE_LETTER_CODE VARCHAR ISO_TWO_LETTER_CODE VARCHAR ISO_THREE_DIGIT_CODE VARCHAR REGION_KEY INTEGER REGION_CODE INTEGER REGION_EN VARCHAR COUNTRY_EN VARCHAR GO_SATISFACTION_DIM Column Type SATISFACTION_KEY INTEGER SATISFACTION_CODE INTEGER SATISFACTION_LOWER_LIMIT FLOAT SATISFACTION_UPPER_LIMIT FLOAT SATISFACTION_DESCRIPTION_EN VARCHAR GO_TIME_DIM Column Type DAY_KEY INTEGER DAY_DATE DATE MONTH_KEY INTEGER CURRENT_MONTH SMALLINT MONTH_NUMBER INTEGER QUARTER_KEY INTEGER CURRENT_QUARTER SMALLINT CURRENT_YEAR SMALLINT DAY_OF_WEEK SMALLINT DAY_OF_MONTH SMALLINT DAYS_IN_MONTH SMALLINT DAY_OF_YEAR SMALLINT WEEK_OF_MONTH SMALLINT WEEK_OF_QUARTER SMALLINT WEEK_OF_YEAR SMALLINT MONTH_EN VARCHAR WEEKDAY_EN VARCHAR GO_TIME_QUARTER_LOOKUP Column Type QUARTER_KEY INTEGER QUARTER_EN VARCHAR MRK_ACTIVITY_STATUS_DIM Column Type ACTIVITY_STATUS_KEY INTEGER ACTIVITY_STATUS_CODE SMALLINT ACTIVITY_STATUS_EN VARCHAR MRK_BUNDLE_GROUP_LOOKUP Column Type BUNDLE_GROUP_CODE INTEGER BUNDLE_GROUP_EN VARCHAR MRK_CAMPAIGN_LOOKUP Column Type CAMPAIGN_CODE INTEGER CAMPAIGN_NAME_EN VARCHAR MRK_PRODUCT_SURVEY_DIM Column Type PRODUCT_SURVEY_KEY INTEGER PRODUCT_TOPIC_CODE INTEGER PRODUCT_TOPIC_EN VARCHAR MRK_PRODUCT_SURVEY_FACT Column Type MONTH_KEY INTEGER ORGANIZATION_KEY INTEGER RTL_COUNTRY_KEY INTEGER BRANCH_KEY INTEGER PRODUCT_KEY INTEGER PRODUCT_SURVEY_KEY INTEGER PRODUCT_TOPIC_SCORE FLOAT MRK_PROD_SURVEY_TARG_FACT Column Type MONTH_KEY INTEGER PRODUCT_KEY INTEGER PRODUCT_SURVEY_KEY INTEGER PRODUCT_TOPIC_TARGET FLOAT MRK_PROMOTION_DIM Column Type PROMOTION_KEY INTEGER PROMOTION_CODE INTEGER CAMPAIGN_CODE INTEGER BUNDLE_GROUP_CODE INTEGER PROMOTION_NAME_EN VARCHAR MRK_PROMOTION_FACT Column Type ORGANIZATION_KEY INTEGER ORDER_DAY_KEY INTEGER RTL_COUNTRY_KEY INTEGER EMPLOYEE_KEY INTEGER RETAILER_KEY INTEGER PRODUCT_KEY INTEGER PROMOTION_KEY INTEGER SALES_ORDER_KEY INTEGER QUANTITY SMALLINT UNIT_COST DECIMAL UNIT_PRICE DECIMAL UNIT_SALE_PRICE DECIMAL GROSS_MARGIN FLOAT SALE_TOTAL DECIMAL GROSS_PROFIT DECIMAL MRK_PROMOTION_PLAN_FACT Column Type MONTH_KEY INTEGER ORGANIZATION_KEY INTEGER BRANCH_KEY INTEGER RTL_COUNTRY_KEY INTEGER PRODUCT_KEY INTEGER PROMOTION_KEY INTEGER UNIT_COST DECIMAL UNIT_PRICE DECIMAL UNIT_SALE_PRICE DECIMAL PROMOTION_PLAN_QUANTITY INTEGER PROMOTION_PLAN_REVENUE DECIMAL MRK_RTL_SURVEY_DIM Column Type RETAILER_SURVEY_KEY INTEGER RETAILER_TOPIC_CODE INTEGER RETAILER_TOPIC_EN VARCHAR MRK_RTL_SURVEY_FACT Column Type MONTH_KEY INTEGER ORGANIZATION_KEY INTEGER BRANCH_KEY INTEGER RTL_COUNTRY_KEY INTEGER RETAILER_KEY INTEGER RETAILER_SURVEY_KEY INTEGER RETAILER_TOPIC_SCORE FLOAT MRK_RTL_SURVEY_TARG_FACT Column Type RETAILER_SURVEY_KEY INTEGER MONTH_KEY INTEGER RETAILER_TOPIC_WEIGHT INTEGER RETAILER_TOPIC_TARGET FLOAT SLS_ORDER_METHOD_DIM Column Type ORDER_METHOD_KEY INTEGER ORDER_METHOD_CODE INTEGER ORDER_METHOD_EN VARCHAR SLS_PRODUCT_BRAND_LOOKUP Column Type PRODUCT_BRAND_CODE INTEGER PRODUCT_BRAND_EN VARCHAR SLS_PRODUCT_COLOR_LOOKUP Column Type PRODUCT_COLOR_CODE INTEGER PRODUCT_COLOR_EN VARCHAR 
SLS_PRODUCT_DIM Column Type PRODUCT_KEY INTEGER PRODUCT_LINE_CODE INTEGER PRODUCT_TYPE_KEY INTEGER PRODUCT_TYPE_CODE INTEGER PRODUCT_NUMBER INTEGER BASE_PRODUCT_KEY INTEGER BASE_PRODUCT_NUMBER INTEGER PRODUCT_COLOR_CODE INTEGER PRODUCT_SIZE_CODE INTEGER PRODUCT_BRAND_KEY INTEGER PRODUCT_BRAND_CODE INTEGER PRODUCT_IMAGE VARCHAR INTRODUCTION_DATE DATE DISCONTINUED_DATE DATE SLS_PRODUCT_LINE_LOOKUP Column Type PRODUCT_LINE_CODE INTEGER PRODUCT_LINE_EN VARCHAR SLS_PRODUCT_LOOKUP Column Type PRODUCT_NUMBER INTEGER PRODUCT_LANGUAGE VARCHAR PRODUCT_NAME VARCHAR PRODUCT_DESCRIPTION VARCHAR SLS_PRODUCT_SIZE_LOOKUP Column Type PRODUCT_SIZE_CODE INTEGER PRODUCT_SIZE_EN VARCHAR SLS_PRODUCT_TYPE_LOOKUP Column Type PRODUCT_TYPE_CODE INTEGER PRODUCT_TYPE_EN VARCHAR SLS_RTL_DIM Column Type RETAILER_SITE_KEY INTEGER RETAILER_SITE_CODE INTEGER RETAILER_KEY INTEGER RETAILER_CODE INTEGER RETAILER_NAME VARCHAR RETAILER_NAME_MB VARCHAR RETAILER_CONTACT_CODE INTEGER CONTACT_FIRST_NAME VARCHAR CONTACT_LAST_NAME VARCHAR GENDER_CODE SMALLINT CONTACT_PHONE_NUMBER VARCHAR CONTACT_EXTENSION VARCHAR CONTACT_FAX VARCHAR CONTACT_EMAIL VARCHAR RTL_ADDRESS1 VARCHAR RTL_ADDRESS2 VARCHAR RTL_CITY VARCHAR RTL_PROV_STATE VARCHAR CONTACT_FIRST_NAME_MB VARCHAR CONTACT_LAST_NAME_MB VARCHAR RTL_ADDRESS1_MB VARCHAR RTL_ADDRESS2_MB VARCHAR RTL_CITY_MB VARCHAR RTL_PROV_STATE_MB VARCHAR RTL_POSTAL_ZONE VARCHAR RTL_COUNTRY_CODE INTEGER RETAILER_START_DATE DATE RETAILER_TYPE_CODE INTEGER RETAILER_TYPE_EN VARCHAR JOB_POSITION_EN VARCHAR SLS_SALES_FACT Column Type ORDER_DAY_KEY INTEGER ORGANIZATION_KEY INTEGER EMPLOYEE_KEY INTEGER RETAILER_KEY INTEGER RETAILER_SITE_KEY INTEGER PRODUCT_KEY INTEGER PROMOTION_KEY INTEGER ORDER_METHOD_KEY INTEGER SALES_ORDER_KEY INTEGER SHIP_DAY_KEY INTEGER CLOSE_DAY_KEY INTEGER QUANTITY BIGINT UNIT_COST DECIMAL UNIT_PRICE DECIMAL UNIT_SALE_PRICE DECIMAL GROSS_MARGIN FLOAT SALE_TOTAL DECIMAL GROSS_PROFIT DECIMAL SLS_SALES_ORDER_DIM Column Type SALES_ORDER_KEY INTEGER ORDER_DETAIL_CODE INTEGER ORDER_NUMBER INTEGER WAREHOUSE_BRANCH_CODE INTEGER SLS_SALES_TARG_FACT Column Type MONTH_KEY INTEGER ORGANIZATION_KEY INTEGER RTL_COUNTRY_KEY INTEGER EMPLOYEE_KEY INTEGER RETAILER_KEY INTEGER PRODUCT_TYPE_KEY INTEGER PRODUCT_BRAND_KEY INTEGER SALES_TARGET DECIMAL","title":"Great Outdoors Company"},{"location":"wxd-datasets-gosales/#great-outdoors-company","text":"The Sample Outdoors Company, or GO Sales, or any variation of the Sample Outdoors name, is the name of a fictitious business operation whose sample data is used to develop sample applications for IBM\u00ae and IBM customers. Its fictitious records include sample data for sales transactions, product distribution, finance, and human resources. Any resemblance to actual names, addresses, contact numbers, or transaction values, is coincidental. The following two links provide more details on the database. Great Outdoors Company Great Outdoors Database Reference The second link will say that there is no content available, but if you click on the down arrow you will see the table names.","title":"Great Outdoors Company"},{"location":"wxd-datasets-gosales/#disclaimer","text":"The Sample Outdoors Company, or GO Sales, or any variation of the Sample Outdoors name, is the name of a fictitious business operation whose sample data is used to develop sample applications for IBM\u00ae and IBM customers. Its fictitious records include sample data for sales transactions, product distribution, finance, and human resources. 
Any resemblance to actual names, addresses, contact numbers, or transaction values, is coincidental. Unauthorized duplication is prohibited.","title":"Disclaimer"},{"location":"wxd-datasets-gosales/#table-definitions","text":"These tables are created under the GOSALESDW schema. Thanks to Michael Schapira for generating the following ER diagram. You may need to download the image to zoom in on the relationships.","title":"Table Definitions"},{"location":"wxd-datasets-gosales/#dist_inventory_fact","text":"Column Type MONTH_KEY INTEGER ORGANIZATION_KEY INTEGER BRANCH_KEY INTEGER PRODUCT_KEY INTEGER OPENING_INVENTORY INTEGER QUANTITY_SHIPPED INTEGER ADDITIONS INTEGER UNIT_COST DECIMAL CLOSING_INVENTORY INTEGER AVERAGE_UNIT_COST DECIMAL","title":"DIST_INVENTORY_FACT"},{"location":"wxd-datasets-gosales/#dist_product_forecase_fact","text":"Column Type MONTH_KEY INTEGER ORGANIZATION_KEY INTEGER BASE_PRODUCT_KEY INTEGER BRANCH_KEY INTEGER UNIT_COST DECIMAL UNIT_PRICE DECIMAL EXPECTED_VOLUME INTEGER","title":"DIST_PRODUCT_FORECASE_FACT"},{"location":"wxd-datasets-gosales/#dist_returned_items_fact","text":"Column Type DAY_KEY INTEGER ORGANIZATION_KEY INTEGER BRANCH_KEY INTEGER EMPLOYEE_KEY INTEGER RETAILER_SITE_KEY INTEGER PRODUCT_KEY INTEGER ORDER_METHOD_KEY INTEGER SALES_ORDER_KEY INTEGER RETURN_REASON_KEY INTEGER RETURN_QUANTITY INTEGER","title":"DIST_RETURNED_ITEMS_FACT"},{"location":"wxd-datasets-gosales/#dist_return_reason_dim","text":"Column Type RETURN_REASON_KEY INTEGER RETURN_REASON_CODE INTEGER REASON_DESCRIPTION_EN VARCHAR","title":"DIST_RETURN_REASON_DIM"},{"location":"wxd-datasets-gosales/#emp_employee_dim","text":"Column Type EMPLOYEE_KEY INTEGER MANAGER_CODE1 INTEGER MANAGER1 VARCHAR MANAGER_MB1 VARCHAR MANAGER_CODE2 INTEGER MANAGER2 VARCHAR MANAGER_MB2 VARCHAR MANAGER_CODE3 INTEGER MANAGER3 VARCHAR MANAGER_MB3 VARCHAR MANAGER_CODE4 INTEGER MANAGER4 VARCHAR MANAGER_MB4 VARCHAR MANAGER_CODE5 INTEGER MANAGER5 VARCHAR MANAGER_MB5 VARCHAR MANAGER_CODE6 INTEGER MANAGER6 VARCHAR MANAGER_MB6 VARCHAR EMPLOYEE_CODE INTEGER EMPLOYEE_NAME VARCHAR FIRST_NAME VARCHAR LAST_NAME VARCHAR EMPLOYEE_NAME_MB VARCHAR FIRST_NAME_MB VARCHAR LAST_NAME_MB VARCHAR MANAGER_CODE INTEGER ORGANIZATION_CODE VARCHAR ADDRESS1 VARCHAR ADDRESS2 VARCHAR ADDRESS1_MB VARCHAR ADDRESS2_MB VARCHAR CITY VARCHAR CITY_MB VARCHAR PROV_STATE VARCHAR PROV_STATE_MB VARCHAR POSTAL_ZONE VARCHAR BRANCH_CODE INTEGER BIRTH_DATE DATE GENDER_CODE SMALLINT WORK_PHONE VARCHAR EXTENSION VARCHAR FAX VARCHAR EMAIL VARCHAR DATE_HIRED DATE TERMINATION_CODE INTEGER TERMINATION_DATE DATE POSITION_START_DATE DATE POSITION_CODE INTEGER EMPLOYEE_LEVEL SMALLINT ACTIVE_INDICATOR SMALLINT RECORD_START_DATE DATE RECORD_END_DATE DATE MANAGER_KEY INTEGER","title":"EMP_EMPLOYEE_DIM"},{"location":"wxd-datasets-gosales/#emp_expense_fact","text":"Column Type DAY_KEY INTEGER ORGANIZATION_KEY INTEGER POSITION_KEY INTEGER EMPLOYEE_KEY INTEGER EXPENSE_TYPE_KEY INTEGER ACCOUNT_KEY INTEGER EXPENSE_UNIT_QUANTITY FLOAT EXPENSE_TOTAL DECIMAL","title":"EMP_EXPENSE_FACT"},{"location":"wxd-datasets-gosales/#emp_expense_plan_fact","text":"Column Type MONTH_KEY INTEGER ORGANIZATION_KEY INTEGER EXPENSE_TYPE_KEY INTEGER ACCOUNT_KEY VARCHAR EXPENSE_PLAN_TOTAL DECIMAL","title":"EMP_EXPENSE_PLAN_FACT"},{"location":"wxd-datasets-gosales/#emp_expense_type_dim","text":"Column Type EXPENSE_TYPE_KEY INTEGER EXPENSE_GROUP_CODE INTEGER EXPENSE_TYPE_CODE INTEGER EXPENSE_UNIT_CODE INTEGER EXPENSE_GROUP_EN VARCHAR EXPENSE_TYPE_EN 
VARCHAR","title":"EMP_EXPENSE_TYPE_DIM"},{"location":"wxd-datasets-gosales/#emp_expense_unit_lookup","text":"Column Type EXPENSE_UNIT_CODE INTEGER EXPENSE_UNIT_EN VARCHAR","title":"EMP_EXPENSE_UNIT_LOOKUP"},{"location":"wxd-datasets-gosales/#emp_position_dim","text":"Column Type POSITION_KEY INTEGER POSITION_CODE1 INTEGER POSITION_CODE2 INTEGER POSITION_CODE3 INTEGER POSITION_CODE INTEGER POSITION_PARENT INTEGER MIN_SALARY DECIMAL MAX_SALARY DECIMAL PAID_HOURLY INTEGER POSITION_LEVEL SMALLINT","title":"EMP_POSITION_DIM"},{"location":"wxd-datasets-gosales/#emp_position_lookup","text":"Column Type POSITION_CODE INTEGER POSITION_EN VARCHAR","title":"EMP_POSITION_LOOKUP"},{"location":"wxd-datasets-gosales/#emp_position_summary_fact","text":"Column Type DAY_KEY INTEGER ORGANIZATION_KEY INTEGER POSITION_KEY INTEGER POSITION_COUNT INTEGER PLANNED_POSITION_COUNT INTEGER INTERNAL_HIRES INTEGER EXTERNAL_HIRES INTEGER TERMINATIONS INTEGER","title":"EMP_POSITION_SUMMARY_FACT"},{"location":"wxd-datasets-gosales/#emp_ranking_dim","text":"Column Type EMPLOYEE_RANKING_KEY INTEGER RANKING_CODE INTEGER RANKING_DESCRIPTION_EN VARCHAR","title":"EMP_RANKING_DIM"},{"location":"wxd-datasets-gosales/#emp_ranking_fact","text":"Column Type ORGANIZATION_KEY INTEGER POSITION_KEY INTEGER EMPLOYEE_KEY INTEGER DAY_KEY INTEGER EMPLOYEE_RANKING_KEY INTEGER RANKING_CODE INTEGER","title":"EMP_RANKING_FACT"},{"location":"wxd-datasets-gosales/#emp_recruitment_dim","text":"Column Type RECRUITMENT_MEDIUM_KEY INTEGER RECRUITMENT_MEDIUM_CODE INTEGER RECRUITMENT_TYPE_CODE INTEGER RECRUITMENT_MEDIUM_NAME_EN VARCHAR RECRUITMENT_TYPE_EN VARCHAR","title":"EMP_RECRUITMENT_DIM"},{"location":"wxd-datasets-gosales/#emp_recruitment_fact","text":"Column Type POST_DAY_KEY INTEGER RECRUITMENT_MEDIUM_KEY INTEGER ORGANIZATION_KEY INTEGER BRANCH_KEY INTEGER POSITION_KEY INTEGER POSITION_POSTING_DATE DATE POSITION_FILLED_DATE DATE POSITION_START_DATE DATE DAYS_TO_FILL INTEGER","title":"EMP_RECRUITMENT_FACT"},{"location":"wxd-datasets-gosales/#emp_succession_fact","text":"Column Type DAY_KEY INTEGER ORGANIZATION_KEY INTEGER POSITION_KEY INTEGER EMPLOYEE_KEY INTEGER SUCCESSOR_EMPLOYEE_KEY INTEGER SUCCESSOR_POSITION_KEY INTEGER SUCCESSOR_STATUS_KEY INTEGER PERCENT_READY FLOAT TARGET_PERCENT_READY FLOAT","title":"EMP_SUCCESSION_FACT"},{"location":"wxd-datasets-gosales/#emp_succession_status_dim","text":"Column Type SUCCESSOR_STATUS_KEY INTEGER SUCCESSOR_STATUS_CODE INTEGER SUCCESSOR_STATUS_EN VARCHAR","title":"EMP_SUCCESSION_STATUS_DIM"},{"location":"wxd-datasets-gosales/#emp_summary_fact","text":"Column Type DAY_KEY INTEGER ORGANIZATION_KEY INTEGER POSITION_KEY INTEGER EMPLOYEE_KEY INTEGER SALARY DECIMAL PAY_INCREASE FLOAT BONUS FLOAT VACATION_DAYS_TAKEN FLOAT SICK_DAYS_TAKEN FLOAT","title":"EMP_SUMMARY_FACT"},{"location":"wxd-datasets-gosales/#emp_survey_fact","text":"Column Type DAY_KEY INTEGER ORGANIZATION_KEY INTEGER POSITION_KEY INTEGER EMPLOYEE_TOPIC_KEY INTEGER EMPLOYEE_TOPIC_SCORE FLOAT SATISFACTION_KEY INTEGER","title":"EMP_SURVEY_FACT"},{"location":"wxd-datasets-gosales/#emp_survey_targ_fact","text":"Column Type MONTH_KEY INTEGER EMPLOYEE_TOPIC_KEY INTEGER EMPLOYEE_TOPIC_TARGET FLOAT EMPLOYEE_SURVEY_BENCHMARK FLOAT","title":"EMP_SURVEY_TARG_FACT"},{"location":"wxd-datasets-gosales/#emp_survey_topic_dim","text":"Column Type EMPLOYEE_TOPIC_KEY INTEGER EMPLOYEE_TOPIC_CODE INTEGER EMPLOYEE_TOPIC_EN VARCHAR","title":"EMP_SURVEY_TOPIC_DIM"},{"location":"wxd-datasets-gosales/#emp_termination_lookup","text":"Column Type TERMINATION_CODE INTEGER 
TERMINATION_REASON_EN VARCHAR","title":"EMP_TERMINATION_LOOKUP"},{"location":"wxd-datasets-gosales/#emp_training_dim","text":"Column Type TRAINING_KEY INTEGER COURSE_CODE INTEGER COURSE_COST DECIMAL COURSE_DAYS FLOAT COURSE_NAME_EN VARCHAR","title":"EMP_TRAINING_DIM"},{"location":"wxd-datasets-gosales/#emp_training_fact","text":"Column Type ORGANIZATION_KEY INTEGER POSITION_KEY INTEGER EMPLOYEE_KEY INTEGER DAY_KEY INTEGER EXPENSE_TYPE_KEY INTEGER TRAINING_KEY INTEGER COURSE_COST DECIMAL COURSE_DAYS FLOAT","title":"EMP_TRAINING_FACT"},{"location":"wxd-datasets-gosales/#fin_account_class_lookup","text":"Column Type ACCOUNT_CLASS_CODE INTEGER ACCOUNT_CLASS_EN VARCHAR","title":"FIN_ACCOUNT_CLASS_LOOKUP"},{"location":"wxd-datasets-gosales/#fin_account_dim","text":"Column Type ACCOUNT_KEY INTEGER ACCOUNT_CODE1 VARCHAR ACCOUNT_CODE2 VARCHAR ACCOUNT_CODE3 VARCHAR ACCOUNT_CODE4 VARCHAR ACCOUNT_CODE5 VARCHAR ACCOUNT_CODE6 VARCHAR ACCOUNT_CODE7 VARCHAR ACCOUNT_CODE8 VARCHAR ACCOUNT_CODE9 VARCHAR ACCOUNT_CODE10 VARCHAR ACCOUNT_CODE11 VARCHAR ACCOUNT_CODE12 VARCHAR ACCOUNT_CODE13 VARCHAR ACCOUNT_CODE14 VARCHAR ACCOUNT_CODE15 VARCHAR ACCOUNT_CODE16 VARCHAR ACCOUNT_CODE VARCHAR ACCOUNT_PARENT VARCHAR DEBIT_OR_CREDIT CHAR(3) ACCOUNT_TYPE_CODE CHAR(3) ACCOUNT_CLASS_CODE INTEGER ACCOUNT_LEVEL INTEGER AGGREGATION_SIGN CHAR(3)","title":"FIN_ACCOUNT_DIM"},{"location":"wxd-datasets-gosales/#fin_account_name_lookup","text":"Column Type ACCOUNT_CODE VARCHAR ACCOUNT_NAME_EN VARCHAR","title":"FIN_ACCOUNT_NAME_LOOKUP"},{"location":"wxd-datasets-gosales/#fin_account_type_lookup","text":"Column Type ACCOUNT_TYPE_CODE CHAR(3) ACCOUNT_TYPE_EN VARCHAR","title":"FIN_ACCOUNT_TYPE_LOOKUP"},{"location":"wxd-datasets-gosales/#fin_finance_fact","text":"Column Type SUBMISSION_KEY INTEGER ORGANIZATION_KEY INTEGER ACCOUNT_KEY INTEGER MONTH_KEY INTEGER AMOUNT_YEAR_TO_DATE DECIMAL AMOUNT_MONTH DECIMAL","title":"FIN_FINANCE_FACT"},{"location":"wxd-datasets-gosales/#fin_subm_currency_lookup","text":"Column Type SUBMISSION_CURRENCY_CODE VARCHAR SUBMISSION_CURRENCY_EN VARCHAR","title":"FIN_SUBM_CURRENCY_LOOKUP"},{"location":"wxd-datasets-gosales/#fin_subm_dim","text":"Column Type SUBMISSION_KEY INTEGER SUBMISSION_CODE VARCHAR SUBMISSION_NAME_EN VARCHAR SUBMISSION_YEAR INTEGER SUBMISSION_TYPE_CODE CHAR(3) SUBMISSION_CURRENCY_CODE VARCHAR","title":"FIN_SUBM_DIM"},{"location":"wxd-datasets-gosales/#fin_subm_type_lookup","text":"Column Type SUBMISSION_TYPE_CODE CHAR(3) SUBMISSION_TYPE_EN VARCHAR","title":"FIN_SUBM_TYPE_LOOKUP"},{"location":"wxd-datasets-gosales/#go_branch_dim","text":"Column Type BRANCH_KEY INTEGER BRANCH_CODE INTEGER ADDRESS1 VARCHAR ADDRESS2 VARCHAR CITY VARCHAR PROV_STATE VARCHAR ADDRESS1_MB VARCHAR ADDRESS2_MB VARCHAR CITY_MB VARCHAR PROV_STATE_MB VARCHAR POSTAL_ZONE VARCHAR COUNTRY_CODE INTEGER WAREHOUSE_BRANCH_CODE INTEGER","title":"GO_BRANCH_DIM"},{"location":"wxd-datasets-gosales/#go_gender_lookup","text":"Column Type GENDER_CODE SMALLINT GENDER_EN VARCHAR","title":"GO_GENDER_LOOKUP"},{"location":"wxd-datasets-gosales/#go_org_dim","text":"Column Type ORGANIZATION_KEY INTEGER ORGANIZATION_CODE1 VARCHAR ORGANIZATION_CODE2 VARCHAR ORGANIZATION_CODE3 VARCHAR ORGANIZATION_CODE4 VARCHAR ORGANIZATION_CODE VARCHAR ORGANIZATION_PARENT VARCHAR ORGANIZATION_LEVEL SMALLINT","title":"GO_ORG_DIM"},{"location":"wxd-datasets-gosales/#go_org_name_lookup","text":"Column Type ORGANIZATION_CODE VARCHAR ORGANIZATION_NAME_EN VARCHAR","title":"GO_ORG_NAME_LOOKUP"},{"location":"wxd-datasets-gosales/#go_region_dim","text":"Column Type 
COUNTRY_KEY INTEGER COUNTRY_CODE INTEGER FLAG_IMAGE VARCHAR ISO_THREE_LETTER_CODE VARCHAR ISO_TWO_LETTER_CODE VARCHAR ISO_THREE_DIGIT_CODE VARCHAR REGION_KEY INTEGER REGION_CODE INTEGER REGION_EN VARCHAR COUNTRY_EN VARCHAR","title":"GO_REGION_DIM"},{"location":"wxd-datasets-gosales/#go_satisfaction_dim","text":"Column Type SATISFACTION_KEY INTEGER SATISFACTION_CODE INTEGER SATISFACTION_LOWER_LIMIT FLOAT SATISFACTION_UPPER_LIMIT FLOAT SATISFACTION_DESCRIPTION_EN VARCHAR","title":"GO_SATISFACTION_DIM"},{"location":"wxd-datasets-gosales/#go_time_dim","text":"Column Type DAY_KEY INTEGER DAY_DATE DATE MONTH_KEY INTEGER CURRENT_MONTH SMALLINT MONTH_NUMBER INTEGER QUARTER_KEY INTEGER CURRENT_QUARTER SMALLINT CURRENT_YEAR SMALLINT DAY_OF_WEEK SMALLINT DAY_OF_MONTH SMALLINT DAYS_IN_MONTH SMALLINT DAY_OF_YEAR SMALLINT WEEK_OF_MONTH SMALLINT WEEK_OF_QUARTER SMALLINT WEEK_OF_YEAR SMALLINT MONTH_EN VARCHAR WEEKDAY_EN VARCHAR","title":"GO_TIME_DIM"},{"location":"wxd-datasets-gosales/#go_time_quarter_lookup","text":"Column Type QUARTER_KEY INTEGER QUARTER_EN VARCHAR","title":"GO_TIME_QUARTER_LOOKUP"},{"location":"wxd-datasets-gosales/#mrk_activity_status_dim","text":"Column Type ACTIVITY_STATUS_KEY INTEGER ACTIVITY_STATUS_CODE SMALLINT ACTIVITY_STATUS_EN VARCHAR","title":"MRK_ACTIVITY_STATUS_DIM"},{"location":"wxd-datasets-gosales/#mrk_bundle_group_lookup","text":"Column Type BUNDLE_GROUP_CODE INTEGER BUNDLE_GROUP_EN VARCHAR","title":"MRK_BUNDLE_GROUP_LOOKUP"},{"location":"wxd-datasets-gosales/#mrk_campaign_lookup","text":"Column Type CAMPAIGN_CODE INTEGER CAMPAIGN_NAME_EN VARCHAR","title":"MRK_CAMPAIGN_LOOKUP"},{"location":"wxd-datasets-gosales/#mrk_product_survey_dim","text":"Column Type PRODUCT_SURVEY_KEY INTEGER PRODUCT_TOPIC_CODE INTEGER PRODUCT_TOPIC_EN VARCHAR","title":"MRK_PRODUCT_SURVEY_DIM"},{"location":"wxd-datasets-gosales/#mrk_product_survey_fact","text":"Column Type MONTH_KEY INTEGER ORGANIZATION_KEY INTEGER RTL_COUNTRY_KEY INTEGER BRANCH_KEY INTEGER PRODUCT_KEY INTEGER PRODUCT_SURVEY_KEY INTEGER PRODUCT_TOPIC_SCORE FLOAT","title":"MRK_PRODUCT_SURVEY_FACT"},{"location":"wxd-datasets-gosales/#mrk_prod_survey_targ_fact","text":"Column Type MONTH_KEY INTEGER PRODUCT_KEY INTEGER PRODUCT_SURVEY_KEY INTEGER PRODUCT_TOPIC_TARGET FLOAT","title":"MRK_PROD_SURVEY_TARG_FACT"},{"location":"wxd-datasets-gosales/#mrk_promotion_dim","text":"Column Type PROMOTION_KEY INTEGER PROMOTION_CODE INTEGER CAMPAIGN_CODE INTEGER BUNDLE_GROUP_CODE INTEGER PROMOTION_NAME_EN VARCHAR","title":"MRK_PROMOTION_DIM"},{"location":"wxd-datasets-gosales/#mrk_promotion_fact","text":"Column Type ORGANIZATION_KEY INTEGER ORDER_DAY_KEY INTEGER RTL_COUNTRY_KEY INTEGER EMPLOYEE_KEY INTEGER RETAILER_KEY INTEGER PRODUCT_KEY INTEGER PROMOTION_KEY INTEGER SALES_ORDER_KEY INTEGER QUANTITY SMALLINT UNIT_COST DECIMAL UNIT_PRICE DECIMAL UNIT_SALE_PRICE DECIMAL GROSS_MARGIN FLOAT SALE_TOTAL DECIMAL GROSS_PROFIT DECIMAL","title":"MRK_PROMOTION_FACT"},{"location":"wxd-datasets-gosales/#mrk_promotion_plan_fact","text":"Column Type MONTH_KEY INTEGER ORGANIZATION_KEY INTEGER BRANCH_KEY INTEGER RTL_COUNTRY_KEY INTEGER PRODUCT_KEY INTEGER PROMOTION_KEY INTEGER UNIT_COST DECIMAL UNIT_PRICE DECIMAL UNIT_SALE_PRICE DECIMAL PROMOTION_PLAN_QUANTITY INTEGER PROMOTION_PLAN_REVENUE DECIMAL","title":"MRK_PROMOTION_PLAN_FACT"},{"location":"wxd-datasets-gosales/#mrk_rtl_survey_dim","text":"Column Type RETAILER_SURVEY_KEY INTEGER RETAILER_TOPIC_CODE INTEGER RETAILER_TOPIC_EN 
VARCHAR","title":"MRK_RTL_SURVEY_DIM"},{"location":"wxd-datasets-gosales/#mrk_rtl_survey_fact","text":"Column Type MONTH_KEY INTEGER ORGANIZATION_KEY INTEGER BRANCH_KEY INTEGER RTL_COUNTRY_KEY INTEGER RETAILER_KEY INTEGER RETAILER_SURVEY_KEY INTEGER RETAILER_TOPIC_SCORE FLOAT","title":"MRK_RTL_SURVEY_FACT"},{"location":"wxd-datasets-gosales/#mrk_rtl_survey_targ_fact","text":"Column Type RETAILER_SURVEY_KEY INTEGER MONTH_KEY INTEGER RETAILER_TOPIC_WEIGHT INTEGER RETAILER_TOPIC_TARGET FLOAT","title":"MRK_RTL_SURVEY_TARG_FACT"},{"location":"wxd-datasets-gosales/#sls_order_method_dim","text":"Column Type ORDER_METHOD_KEY INTEGER ORDER_METHOD_CODE INTEGER ORDER_METHOD_EN VARCHAR","title":"SLS_ORDER_METHOD_DIM"},{"location":"wxd-datasets-gosales/#sls_product_brand_lookup","text":"Column Type PRODUCT_BRAND_CODE INTEGER PRODUCT_BRAND_EN VARCHAR","title":"SLS_PRODUCT_BRAND_LOOKUP"},{"location":"wxd-datasets-gosales/#sls_product_color_lookup","text":"Column Type PRODUCT_COLOR_CODE INTEGER PRODUCT_COLOR_EN VARCHAR","title":"SLS_PRODUCT_COLOR_LOOKUP"},{"location":"wxd-datasets-gosales/#sls_product_dim","text":"Column Type PRODUCT_KEY INTEGER PRODUCT_LINE_CODE INTEGER PRODUCT_TYPE_KEY INTEGER PRODUCT_TYPE_CODE INTEGER PRODUCT_NUMBER INTEGER BASE_PRODUCT_KEY INTEGER BASE_PRODUCT_NUMBER INTEGER PRODUCT_COLOR_CODE INTEGER PRODUCT_SIZE_CODE INTEGER PRODUCT_BRAND_KEY INTEGER PRODUCT_BRAND_CODE INTEGER PRODUCT_IMAGE VARCHAR INTRODUCTION_DATE DATE DISCONTINUED_DATE DATE","title":"SLS_PRODUCT_DIM"},{"location":"wxd-datasets-gosales/#sls_product_line_lookup","text":"Column Type PRODUCT_LINE_CODE INTEGER PRODUCT_LINE_EN VARCHAR","title":"SLS_PRODUCT_LINE_LOOKUP"},{"location":"wxd-datasets-gosales/#sls_product_lookup","text":"Column Type PRODUCT_NUMBER INTEGER PRODUCT_LANGUAGE VARCHAR PRODUCT_NAME VARCHAR PRODUCT_DESCRIPTION VARCHAR","title":"SLS_PRODUCT_LOOKUP"},{"location":"wxd-datasets-gosales/#sls_product_size_lookup","text":"Column Type PRODUCT_SIZE_CODE INTEGER PRODUCT_SIZE_EN VARCHAR","title":"SLS_PRODUCT_SIZE_LOOKUP"},{"location":"wxd-datasets-gosales/#sls_product_type_lookup","text":"Column Type PRODUCT_TYPE_CODE INTEGER PRODUCT_TYPE_EN VARCHAR","title":"SLS_PRODUCT_TYPE_LOOKUP"},{"location":"wxd-datasets-gosales/#sls_rtl_dim","text":"Column Type RETAILER_SITE_KEY INTEGER RETAILER_SITE_CODE INTEGER RETAILER_KEY INTEGER RETAILER_CODE INTEGER RETAILER_NAME VARCHAR RETAILER_NAME_MB VARCHAR RETAILER_CONTACT_CODE INTEGER CONTACT_FIRST_NAME VARCHAR CONTACT_LAST_NAME VARCHAR GENDER_CODE SMALLINT CONTACT_PHONE_NUMBER VARCHAR CONTACT_EXTENSION VARCHAR CONTACT_FAX VARCHAR CONTACT_EMAIL VARCHAR RTL_ADDRESS1 VARCHAR RTL_ADDRESS2 VARCHAR RTL_CITY VARCHAR RTL_PROV_STATE VARCHAR CONTACT_FIRST_NAME_MB VARCHAR CONTACT_LAST_NAME_MB VARCHAR RTL_ADDRESS1_MB VARCHAR RTL_ADDRESS2_MB VARCHAR RTL_CITY_MB VARCHAR RTL_PROV_STATE_MB VARCHAR RTL_POSTAL_ZONE VARCHAR RTL_COUNTRY_CODE INTEGER RETAILER_START_DATE DATE RETAILER_TYPE_CODE INTEGER RETAILER_TYPE_EN VARCHAR JOB_POSITION_EN VARCHAR","title":"SLS_RTL_DIM"},{"location":"wxd-datasets-gosales/#sls_sales_fact","text":"Column Type ORDER_DAY_KEY INTEGER ORGANIZATION_KEY INTEGER EMPLOYEE_KEY INTEGER RETAILER_KEY INTEGER RETAILER_SITE_KEY INTEGER PRODUCT_KEY INTEGER PROMOTION_KEY INTEGER ORDER_METHOD_KEY INTEGER SALES_ORDER_KEY INTEGER SHIP_DAY_KEY INTEGER CLOSE_DAY_KEY INTEGER QUANTITY BIGINT UNIT_COST DECIMAL UNIT_PRICE DECIMAL UNIT_SALE_PRICE DECIMAL GROSS_MARGIN FLOAT SALE_TOTAL DECIMAL GROSS_PROFIT 
DECIMAL","title":"SLS_SALES_FACT"},{"location":"wxd-datasets-gosales/#sls_sales_order_dim","text":"Column Type SALES_ORDER_KEY INTEGER ORDER_DETAIL_CODE INTEGER ORDER_NUMBER INTEGER WAREHOUSE_BRANCH_CODE INTEGER","title":"SLS_SALES_ORDER_DIM"},{"location":"wxd-datasets-gosales/#sls_sales_targ_fact","text":"Column Type MONTH_KEY INTEGER ORGANIZATION_KEY INTEGER RTL_COUNTRY_KEY INTEGER EMPLOYEE_KEY INTEGER RETAILER_KEY INTEGER PRODUCT_TYPE_KEY INTEGER PRODUCT_BRAND_KEY INTEGER SALES_TARGET DECIMAL","title":"SLS_SALES_TARG_FACT"},{"location":"wxd-datasets-intro/","text":"Datasets There are three datasets that have been loaded into watsonx.data system for you to use while exploring the features of the product. Great Outdoors Company warehouse data Airline On-Time performance Taxi fares Data Location The datasets found above have already been preloaded into the system, so there is no need to run the scripts below unless you want to modify the schemas or location of the data. The data files can be found in the /sampledata directory. Underneath this directory you will find datasets in three different formats: Parquet - Data that has been formatted in Parquet format that can be loaded directly into Hive and queried by watsonx.data. Relational - Data that is in a delimited format that can be loaded into Db2 or PostgreSQL databases. CSV - Comma separated values that can be converted to multiple formats or used by watsonx.data. Within the Parquet and Relational directories are SQL statements that can be used to catalog and load the data into the different systems.","title":"Dataset Overview"},{"location":"wxd-datasets-intro/#datasets","text":"There are three datasets that have been loaded into watsonx.data system for you to use while exploring the features of the product. Great Outdoors Company warehouse data Airline On-Time performance Taxi fares","title":"Datasets"},{"location":"wxd-datasets-intro/#data-location","text":"The datasets found above have already been preloaded into the system, so there is no need to run the scripts below unless you want to modify the schemas or location of the data. The data files can be found in the /sampledata directory. Underneath this directory you will find datasets in three different formats: Parquet - Data that has been formatted in Parquet format that can be loaded directly into Hive and queried by watsonx.data. Relational - Data that is in a delimited format that can be loaded into Db2 or PostgreSQL databases. CSV - Comma separated values that can be converted to multiple formats or used by watsonx.data. Within the Parquet and Relational directories are SQL statements that can be used to catalog and load the data into the different systems.","title":"Data Location"},{"location":"wxd-datasets-load/","text":"External Datasets There are a variety of data sets available for you to load from external sites. Check out the following websites for a variety of public data sets that you can use. Awesome Public Datasets Kaggle Datasets US Data.Gov UCI Machine Learning Repository US Fuel Economy Note : These sites have not been checked for license restrictions on the use of the data. You are responsible for checking that the data can be used without any licensing requirements. Loading your own data You can use a browser or link to an external file repository (i.e., Box) and download data directly to your workstation. Data can be CSV, Parquet, JSON, or TXT formats. Once the data is on your workstation, use the following steps. 
Note : You cannot import customer data or any data that has restrictions associated with its use. Any use of private data is in violation of the terms and conditions of using this image. The first step is to connect to MinIO. Extract the MinIO credentials by using the passwords command: passwords Open your browser and navigate to the MinIO console. Log in with the object store credentials found above (These will be different for your system). You should see the current buckets in MinIO. If you don't see the buckets, click on Object Browser on the left-hand side panel. Select hive-bucket from the list of buckets. You may see other directories in this list than what is shown above. You will need to create a new path for your data set. Create a new directory name for your data (fuel_economy was used for this example). MinIO will display an empty directory and suggest you load something into it. Use the Upload button on the far right side to point to your dataset on your local machine. In this example, we are using a CSV file for the 2013 fuel economy estimates for automobiles sold in the US. You may need to rename your datasets to eliminate blanks and any special characters other than \" _ \" or \" - \". The display will show the progress of the upload into the bucket. You may need to refresh your browser to see the file in the bucket. Now that the data has been loaded into a bucket, you can catalog it in the watsonx.data UI. If you created a new bucket for this data set, you will need to register it first in the watsonx.data UI. Instructions for how to do this are found in the Working with Object Store Buckets section. Start by navigating to the watsonx.data UI and look at the Infrastructure manager. Find the bucket into which you uploaded your data set and note the catalog name that it is associated with. Here we can see that the hive-bucket bucket is associated with the hive_data catalog. In the watsonx.data UI, select the Query workspace (SQL) icon. You will need to create a schema that links to this data set. The format of the command is shown below. DROP SCHEMA catalog.schema_name; CREATE SCHEMA catalog.schema_name WITH ( location = 's3a://your_bucket/data_directory' ); You will need to change the following values: catalog - The catalog that the bucket you are using is associated with schema_name - A schema name to associate your tables with data_directory - The directory in which your file is located your_bucket - The bucket the data physically resides in For the fuel economy example, using mpg as the schema, the SQL would be: DROP SCHEMA hive_data.mpg; CREATE SCHEMA hive_data.mpg WITH ( location = 's3a://hive-bucket/fuel_economy' ); Run this SQL against the Presto engine: The DROP command may fail if the schema doesn't exist, but the CREATE should work. The next step is to define what the table looks like for watsonx.data to be able to query it. The syntax of the CREATE TABLE statement is similar to: CREATE TABLE catalog.schema.tablename ( \"column_name\" \"type\", ... ) WITH ( format = 'CSV', csv_separator = ',', external_location = 's3a://your_bucket/data_directory'); You will need to create a table definition for your CSV file in order to catalog it in watsonx.data. Note that the only data type that is permitted for CSV columns is varchar . This is a restriction of the current driver. There are plans to update it to include other data types over time. 
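Since every column in a CSV table is read as varchar, cast values before doing arithmetic or numeric sorting. As a sketch (it assumes the fueleconomy table that is defined later in this section, and uses the Presto TRY_CAST function, which returns NULL for values that cannot be converted, such as the header row): SELECT \"CARLINE\", TRY_CAST(\"CITY_FE_CONVENTIONAL_FUEL\" AS INTEGER) AS city_mpg FROM hive_data.mpg.fueleconomy WHERE TRY_CAST(\"CITY_FE_CONVENTIONAL_FUEL\" AS INTEGER) IS NOT NULL ORDER BY city_mpg DESC LIMIT 5 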
If your data set does not include a header row (a row that defines the column names), you will need to create the table definition manually. If the data set does contain a header record, you can use the following Python code to generate a CREATE TABLE statement. You will need to make sure that pandas is available. python3 -m pip install pandas --user Next, run the python3 command in the shell to run an interactive Python session. python3 Place the following code into your Python window. def showcsv(catalog, schema, tablename, bucket, directory, csv_in): import pandas as pd df = pd.read_csv(csv_in,na_values=\"-\") df = df.fillna(0) column_headers = list(df.columns.values) print(\"\") print(f\"DROP TABLE IF EXISTS {catalog}.{schema}.{tablename};\") print(f\"CREATE TABLE {catalog}.{schema}.{tablename}\") print(\" (\") comma = \"\" end = \"\" for header in column_headers: print(f\"{comma}\",end=end) comma = \",\" end = \"\\n\" print(f' \"{header}\" varchar',end=\"\") print(f\" )\") print(f\"WITH (\") print(f\" format = 'CSV',\") print(f\" csv_separator = ',',\") print(f\" external_location = 's3a://{bucket}/{directory}'\") print(f\" );\") print(\"\") def makesql(): catalog = input(\"Catalog : \") schema = input(\"Schema : \") table = input(\"Table : \") bucket = input(\"Bucket : \") dir = input(\"Directory : \") csv = input(\"CSV File : \") showcsv(catalog,schema,table,bucket,dir,csv) Gather the following information on your dataset: catalog - The catalog the schema and table are created under ( hive_data ) schema - The schema name that you created to hold your table ( mpg ) table name - The name of the table ( fuel_economy ) bucket - Where the data is located ( hive-bucket ) directory - What directory contains your data ( fuel_economy ) csv_in - The location on your local machine where the csv file is Once you have gathered that, run the following command in your Python window and answer the prompts. makesql() >>> makesql() Catalog : hive_data Schema : mpg Table : fueleconomy Bucket : hive-bucket Directory : fuel_economy CSV File : ~/Downloads/fuel_economy_2013.csv DROP TABLE IF EXISTS hive_data.mpg.fueleconomy; CREATE TABLE hive_data.mpg.fueleconomy ( \"MODEL_YEAR\" varchar, \"MFR_NAME\" varchar, \"DIVISION\" varchar, \"CARLINE\" varchar, \"ENG_DISPL\" varchar, \"CYL\" varchar, \"TRANS_IN_FE_GUIDE\" varchar, \"CITY_FE_CONVENTIONAL_FUEL\" varchar, \"HWY_FE_CONVENTIONAL_FUEL\" varchar, \"COMB_FE_CONVENTIONAL_FUEL\" varchar, \"AIR_ASPIRATION_DESC\" varchar, \"TRANS_DESC\" varchar, \"GEARS\" varchar, \"DRIVE_DESC\" varchar, \"FUEL_UNIT_CONVENTIONAL_FUEL\" varchar, \"FUEL_UNIT_DESC_CONVENTIONAL_FUEL\" varchar, \"ANNUAL_FUEL_COST_CONVENTIONAL\" varchar, \"FUEL_METERING_SYS_DESC\" varchar ) WITH ( format = 'CSV', csv_separator = ',', external_location = 's3a://hive-bucket/fuel_economy' ); Cut and paste the output from the command into the watsonx.data Data Explorer window to create the table. Now you can query the table with the following SQL. Note that the header record still exists in the answer set since we did not remove it from the CSV file. SELECT * FROM hive_data.mpg.fueleconomy LIMIT 10","title":"Loading External Datasets"},{"location":"wxd-datasets-load/#external-datasets","text":"There are a variety of data sets available for you to load from external sites. Check out the following websites for public data sets that you can use. 
Awesome Public Datasets Kaggle Datasets US Data.Gov UCI Machine Learning Repository US Fuel Economy Note : These sites have not been checked for license restrictions on the use of the data. You are responsible for checking that the data can be used without any licensing requirements.","title":"External Datasets"},{"location":"wxd-datasets-load/#loading-your-own-data","text":"You can use a browser or link to an external file repository (e.g., Box) and download data directly to your workstation. Data can be in CSV, Parquet, JSON, or TXT format. Once the data is on your workstation, use the following steps. Note : You cannot import customer data or any data that has restrictions associated with its use. Any use of private data is in violation of the terms and conditions of using this image. The first step is to connect to MinIO. Extract the MinIO credentials by using the passwords command: passwords Open your browser and navigate to the MinIO console. Log in with the object store credentials found above (These will be different for your system). You should see the current buckets in MinIO. If you don't see the buckets, click on Object Browser on the left-hand side panel. Select hive-bucket from the list of buckets. You may see other directories in this list than what is shown above. You will need to create a new path for your data set. Create a new directory name for your data (fuel_economy was used for this example). MinIO will display an empty directory and suggest you load something into it. Use the Upload button on the far right side to point to your dataset on your local machine. In this example, we are using a CSV file for the 2013 fuel economy estimates for automobiles sold in the US. You may need to rename your datasets to eliminate blanks and any special characters other than \" _ \" or \" - \". The display will show the progress of the upload into the bucket. You may need to refresh your browser to see the file in the bucket. Now that the data has been loaded into a bucket, you can catalog it in the watsonx.data UI. If you created a new bucket for this data set, you will need to register it first in the watsonx.data UI. Instructions for how to do this are found in the Working with Object Store Buckets section. Start by navigating to the watsonx.data UI and look at the Infrastructure manager. Find the bucket into which you uploaded your data set and note the catalog name that it is associated with. Here we can see that the hive-bucket bucket is associated with the hive_data catalog. In the watsonx.data UI, select the Query workspace (SQL) icon. You will need to create a schema that links to this data set. The format of the command is shown below. DROP SCHEMA catalog.schema_name; CREATE SCHEMA catalog.schema_name WITH ( location = 's3a://your_bucket/data_directory' ); You will need to change the following values: catalog - The catalog that the bucket you are using is associated with schema_name - A schema name to associate your tables with data_directory - The directory in which your file is located your_bucket - The bucket the data physically resides in For the fuel economy example, using mpg as the schema, the SQL would be: DROP SCHEMA hive_data.mpg; CREATE SCHEMA hive_data.mpg WITH ( location = 's3a://hive-bucket/fuel_economy' ); Run this SQL against the Presto engine: The DROP command may fail if the schema doesn't exist, but the CREATE should work. The next step is to define what the table looks like for watsonx.data to be able to query it. 
The syntax of the CREATE TABLE statement is similar to: CREATE TABLE catalog.schema.tablename ( \"column_name\" \"type\", ... ) WITH ( format = 'CSV', csv_separator = ',', external_location = 's3a://your_bucket/data_directory'); You will need to create a table definition for your CSV file in order to catalog it in watsonx.data. Note that the only data type that is permitted for CSV columns is varchar . This is a restriction of the current driver. There are plans to update it to include other data types over time. If your data set does not include a header row (a row that defines the column names), you will need to create the table definition manually. If the data set does contain a header record, you can use the following Python code to generate a CREATE TABLE statement. You will need to make sure that pandas is available. python3 -m pip install pandas --user Next, run the python3 command in the shell to run an interactive Python session. python3 Place the following code into your Python window. def showcsv(catalog, schema, tablename, bucket, directory, csv_in): import pandas as pd df = pd.read_csv(csv_in,na_values=\"-\") df = df.fillna(0) column_headers = list(df.columns.values) print(\"\") print(f\"DROP TABLE IF EXISTS {catalog}.{schema}.{tablename};\") print(f\"CREATE TABLE {catalog}.{schema}.{tablename}\") print(\" (\") comma = \"\" end = \"\" for header in column_headers: print(f\"{comma}\",end=end) comma = \",\" end = \"\\n\" print(f' \"{header}\" varchar',end=\"\") print(f\" )\") print(f\"WITH (\") print(f\" format = 'CSV',\") print(f\" csv_separator = ',',\") print(f\" external_location = 's3a://{bucket}/{directory}'\") print(f\" );\") print(\"\") def makesql(): catalog = input(\"Catalog : \") schema = input(\"Schema : \") table = input(\"Table : \") bucket = input(\"Bucket : \") dir = input(\"Directory : \") csv = input(\"CSV File : \") showcsv(catalog,schema,table,bucket,dir,csv) Gather the following information on your dataset: catalog - The catalog the schema and table are created under ( hive_data ) schema - The schema name that you created to hold your table ( mpg ) table name - The name of the table ( fuel_economy ) bucket - Where the data is located ( hive-bucket ) directory - What directory contains your data ( fuel_economy ) csv_in - The location on your local machine where the csv file is Once you have gathered that, run the following command in your Python window and answer the prompts. makesql() >>> makesql() Catalog : hive_data Schema : mpg Table : fueleconomy Bucket : hive-bucket Directory : fuel_economy CSV File : ~/Downloads/fuel_economy_2013.csv DROP TABLE IF EXISTS hive_data.mpg.fueleconomy; CREATE TABLE hive_data.mpg.fueleconomy ( \"MODEL_YEAR\" varchar, \"MFR_NAME\" varchar, \"DIVISION\" varchar, \"CARLINE\" varchar, \"ENG_DISPL\" varchar, \"CYL\" varchar, \"TRANS_IN_FE_GUIDE\" varchar, \"CITY_FE_CONVENTIONAL_FUEL\" varchar, \"HWY_FE_CONVENTIONAL_FUEL\" varchar, \"COMB_FE_CONVENTIONAL_FUEL\" varchar, \"AIR_ASPIRATION_DESC\" varchar, \"TRANS_DESC\" varchar, \"GEARS\" varchar, \"DRIVE_DESC\" varchar, \"FUEL_UNIT_CONVENTIONAL_FUEL\" varchar, \"FUEL_UNIT_DESC_CONVENTIONAL_FUEL\" varchar, \"ANNUAL_FUEL_COST_CONVENTIONAL\" varchar, \"FUEL_METERING_SYS_DESC\" varchar ) WITH ( format = 'CSV', csv_separator = ',', external_location = 's3a://hive-bucket/fuel_economy' ); Cut and paste the output from the command into the watsonx.data Data Explorer window to create the table. Now you can query the table with the following SQL. 
Note that the header record still exists in the answer set since we did not remove it from the CSV file. SELECT * FROM hive_data.mpg.fueleconomy LIMIT 10","title":"Loading your own data"},{"location":"wxd-datasets-ontime/","text":"On-Time Performance Dataset The Airline On-Time performance database contains information on flights within the US from 1987 through 2020. This is a very large dataset, so only the records from January 2013 have been included inside this image. The following link provides more information on the dataset and the columns that are found in the records. Note that the version of the data used in this system does not contain the diversion records 1 through 5. These fields are blank in the data sample used. Note that the initial diversion airport does exist in the record. Airline Report On-Time Performance Dataset Disclaimer Except as expressly set forth in this agreement, the data (including enhanced data) is provided on an \"as is\" basis, without warranties or conditions of any kind, either express or implied including, without limitation, any warranties or conditions of title, non-infringement, merchantability or fitness for a particular purpose. Neither you nor any data providers shall have any liability for any direct, indirect, incidental, special, exemplary, or consequential damages (including without limitation lost profits), however caused and on any theory of liability, whether in contract, strict liability, or tort (including negligence or otherwise) arising in any way out of the use or distribution of the data or the exercise of any rights granted hereunder, even if advised of the possibility of such damages. Tables AIRCRAFT Column Type TAIL_NUMBER VARCHAR MANUFACTURER VARCHAR MODEL VARCHAR AIRLINE_ID Column Type Code INT Description VARCHAR AIRPORT_ID Column Type Code INT Description VARCHAR CANCELLATION Column Type Code INT Description VARCHAR ONTIME Column Type Year INT Quarter INT Month INT DayofMonth INT DayOfWeek INT FlightDate VARCHAR Reporting_Airline VARCHAR DOT_ID_Reporting_Airline INT IATA_CODE_Reporting_Airline VARCHAR Tail_Number VARCHAR Flight_Number_Reporting_Airline INT OriginAirportID INT OriginAirportSeqID INT OriginCityMarketID INT Origin VARCHAR OriginCityName VARCHAR OriginState VARCHAR OriginStateFips VARCHAR OriginStateName VARCHAR OriginWac INT DestAirportID INT DestAirportSeqID INT DestCityMarketID INT Dest VARCHAR DestCityName VARCHAR DestState VARCHAR DestStateFips VARCHAR DestStateName VARCHAR DestWac INT CRSDepTime INT DepTime INT DepDelay INT DepDelayMinutes INT DepDel15 INT DepartureDelayGroups INT DepTimeBlk VARCHAR TaxiOut INT WheelsOff INT WheelsOn INT TaxiIn INT CRSArrTime INT ArrTime INT ArrDelay INT ArrDelayMinutes INT ArrDel15 INT ArrivalDelayGroups INT ArrTimeBlk VARCHAR Cancelled INT CancellationCode INT Diverted INT CRSElapsedTime INT ActualElapsedTime INT AirTime SMALLINT Flights INT Distance INT DistanceGroup INT CarrierDelay INT WeatherDelay INT NASDelay INT SecurityDelay INT LateAircraftDelay INT FirstDepTime INT TotalAddGTime INT LongestAddGTime INT DivAirportLandings INT DivReachedDest INT DivActualElapsedTime INT DivArrDelay INT DivDistance INT DivAirport VARCHAR","title":"Ontime Flight Performance"},{"location":"wxd-datasets-ontime/#on-time-performance-dataset","text":"The Airline On-Time performance database contains information on flights within the US from 1987 through 2020. This is a very large dataset, so only the records from January 2013 have been included inside this image. 
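Once the data has been cataloged, it can be queried like any other table; as a sketch (this assumes the ONTIME table has been cataloged as hive_data.ontime.ontime - substitute the catalog and schema used in your environment), the following query computes the average arrival delay per reporting airline: SELECT Reporting_Airline, AVG(ArrDelay) AS avg_arrival_delay FROM hive_data.ontime.ontime GROUP BY Reporting_Airline ORDER BY avg_arrival_delay DESC 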
The following link provides more information on the dataset and the columns that are found in the records. Note that the version of the data used in this system does not contain the diversion records 1 through 5. These fields are blank in the data sample used. Note that the initial diversion airport does exist in the record. Airline Report On-Time Performance Dataset","title":"On-Time Performance Dataset"},{"location":"wxd-datasets-ontime/#disclaimer","text":"Except as expressly set forth in this agreement, the data (including enhanced data) is provided on an \"as is\" basis, without warranties or conditions of any kind, either express or implied including, without limitation, any warranties or conditions of title, non-infringement, merchantability or fitness for a particular purpose. Neither you nor any data providers shall have any liability for any direct, indirect, incidental, special, exemplary, or consequential damages (including without limitation lost profits), however caused and on any theory of liability, whether in contract, strict liability, or tort (including negligence or otherwise) arising in any way out of the use or distribution of the data or the exercise of any rights granted hereunder, even if advised of the possibility of such damages.","title":"Disclaimer"},{"location":"wxd-datasets-ontime/#tables","text":"","title":"Tables"},{"location":"wxd-datasets-ontime/#aircraft","text":"Column Type TAIL_NUMBER VARCHAR MANUFACTURER VARCHAR MODEL VARCHAR","title":"AIRCRAFT"},{"location":"wxd-datasets-ontime/#airline_id","text":"Column Type Code INT Description VARCHAR","title":"AIRLINE_ID"},{"location":"wxd-datasets-ontime/#airport_id","text":"Column Type Code INT Description VARCHAR","title":"AIRPORT_ID"},{"location":"wxd-datasets-ontime/#cancellation","text":"Column Type Code INT Description VARCHAR","title":"CANCELLATION"},{"location":"wxd-datasets-ontime/#ontime","text":"Column Type Year INT Quarter INT Month INT DayofMonth INT DayOfWeek INT FlightDate VARCHAR Reporting_Airline VARCHAR DOT_ID_Reporting_Airline INT IATA_CODE_Reporting_Airline VARCHAR Tail_Number VARCHAR Flight_Number_Reporting_Airline INT OriginAirportID INT OriginAirportSeqID INT OriginCityMarketID INT Origin VARCHAR OriginCityName VARCHAR OriginState VARCHAR OriginStateFips VARCHAR OriginStateName VARCHAR OriginWac INT DestAirportID INT DestAirportSeqID INT DestCityMarketID INT Dest VARCHAR DestCityName VARCHAR DestState VARCHAR DestStateFips VARCHAR DestStateName VARCHAR DestWac INT CRSDepTime INT DepTime INT DepDelay INT DepDelayMinutes INT DepDel15 INT DepartureDelayGroups INT DepTimeBlk VARCHAR TaxiOut INT WheelsOff INT WheelsOn INT TaxiIn INT CRSArrTime INT ArrTime INT ArrDelay INT ArrDelayMinutes INT ArrDel15 INT ArrivalDelayGroups INT ArrTimeBlk VARCHAR Cancelled INT CancellationCode INT Diverted INT CRSElapsedTime INT ActualElapsedTime INT AirTime SMALLINT Flights INT Distance INT DistanceGroup INT CarrierDelay INT WeatherDelay INT NASDelay INT SecurityDelay INT LateAircraftDelay INT FirstDepTime INT TotalAddGTime INT LongestAddGTime INT DivAirportLandings INT DivReachedDest INT DivActualElapsedTime INT DivArrDelay INT DivDistance INT DivAirport VARCHAR","title":"ONTIME"},{"location":"wxd-datasets-taxi/","text":"Chicago Taxi Data Taxi trips are reported to the City of Chicago in its role as a regulatory agency. To protect privacy but allow for aggregate analyses, the Taxi ID is consistent for any given taxi medallion number but does not show the number. 
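Like the other sample datasets, the taxi data can be queried directly once it has been cataloged; as a sketch (this assumes the TAXIRIDES table described later in this section has been cataloged as hive_data.taxi.taxirides - adjust the names to match your environment), the following query computes the average tip by payment type: SELECT PAYMENT_TYPE, AVG(TIPS) AS avg_tip FROM hive_data.taxi.taxirides GROUP BY PAYMENT_TYPE ORDER BY avg_tip DESC 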
The data set used in this system contains records from January 1st, 2013 and does not include the census tract value or the Taxi ID. Taxi Trips Disclaimer This site provides applications using data that has been modified for use from its original source, www.cityofchicago.org, the official website of the City of Chicago. The City of Chicago makes no claims as to the content, accuracy, timeliness, or completeness of the data provided at this site. The data provided at this site is subject to change at any time. It is understood that the data provided at this site is being used at one\u2019s own risk. Tables TAXIRIDES Column Type TRIP_ID int COMPANY varchar DROPOFF_LATITUDE double DROPOFF_LONGITUDE double EXTRAS double FARE double PAYMENT_TYPE varchar PICKUP_LATITUDE double PICKUP_LONGITUDE double TIPS double TOLLS double TRIP_END_TIMESTAMP timestamp TRIP_MILES double TRIP_SECONDS int TRIP_START_TIMESTAMP timestamp TRIP_TOTAL double","title":"Taxi Rides"},{"location":"wxd-datasets-taxi/#chicago-taxi-data","text":"Taxi trips are reported to the City of Chicago in its role as a regulatory agency. To protect privacy but allow for aggregate analyses, the Taxi ID is consistent for any given taxi medallion number but does not show the number. The data set used in this system contains records from January 1st, 2013 and does not include the census tract value or the Taxi ID. Taxi Trips","title":"Chicago Taxi Data"},{"location":"wxd-datasets-taxi/#disclaimer","text":"This site provides applications using data that has been modified for use from its original source, www.cityofchicago.org, the official website of the City of Chicago. The City of Chicago makes no claims as to the content, accuracy, timeliness, or completeness of the data provided at this site. The data provided at this site is subject to change at any time. It is understood that the data provided at this site is being used at one\u2019s own risk.","title":"Disclaimer"},{"location":"wxd-datasets-taxi/#tables","text":"","title":"Tables"},{"location":"wxd-datasets-taxi/#taxirides","text":"Column Type TRIP_ID int COMPANY varchar DROPOFF_LATITUDE double DROPOFF_LONGITUDE double EXTRAS double FARE double PAYMENT_TYPE varchar PICKUP_LATITUDE double PICKUP_LONGITUDE double TIPS double TOLLS double TRIP_END_TIMESTAMP timestamp TRIP_MILES double TRIP_SECONDS int TRIP_START_TIMESTAMP timestamp TRIP_TOTAL double","title":"TAXIRIDES"},{"location":"wxd-datasets/","text":"Datasets There are three datasets that have been loaded into the watsonx.data system for you to use while exploring the features of the product. These links will give you more details on each of the data sets, including options for loading your own data into this environment. Great Outdoors Company warehouse data Airline On-Time performance Taxi fares For information on other sources of data and how to import that data, see the following links. 
Alternate Data Sets Loading External Datasets","title":"Datasets"},{"location":"wxd-datasets/#datasets","text":"There are three datasets that have been loaded into the watsonx.data system for you to use while exploring the features of the product. These links will give you more details on each of the data sets, including options for loading your own data into this environment. Great Outdoors Company warehouse data Airline On-Time performance Taxi fares For information on other sources of data and how to import that data, see the following links. Alternate Data Sets Loading External Datasets","title":"Datasets"},{"location":"wxd-dbeaver/","text":"dBeaver Client Tool You could use any tool that supports connectivity through JDBC drivers to connect to watsonx.data, but we chose to use dBeaver for this lab. dBeaver is a client tool that we can use to connect to watsonx.data and execute queries. The tool has been installed in the watsonx user's home directory. To access dBeaver, you must use the VNC service which has been installed on this server for you. Start dBeaver Locally To start dBeaver, you must be connected to the VM console of the Linux server as the watsonx user (see Accessing the Console ). In the virtual machine, click on the Applications button, choose the Database folder and click on the dBeaver icon. The start-up screen for dBeaver will display. The dBeaver program may ask if you want to create an empty database or update the release. Just say No. The first dialog from dBeaver will ask you to create a database connection. If you do not see this screen, select Database, and then select New Database Connection: Catalog watsonx.data Connection We will use the PrestoDB JDBC connector (NOT PrestoSQL). PrestoSQL is the former name of Trino, a fork of PrestoDB; its driver may also work, but this lab uses the PrestoDB connector. Select SQL (see the left side) and scroll down until you see PrestoDB. Select PrestoDB and then press \"Next\". The following screen will be displayed. Enter the following values into the dialog. Note : These settings are case-sensitive. Host: localhost Port: 8443 Username: ibmlhadmin Password: password Database: tpch Then select the Driver Properties tab. You might be asked to download the database driver. Make sure you select \"Force Download\", otherwise it will not properly download the driver. Once downloaded, it will display the Driver properties dialog. Press the [+] button on the bottom left of the User Properties list. You need to enter three properties: SSL True SSLTrustStorePath /certs/presto-key.jks SSLTrustStorePassword watsonx.data Enter the property name \"SSL\", in uppercase (the parameter is case-sensitive!). When you hit OK it will display the setting in the list. Click on the SSL field and you will update the value to True and hit Enter. Add another field called SSLTrustStorePath and give it a value of /certs/presto-key.jks and finally add the SSLTrustStorePassword setting with a value of watsonx.data . The panel should now contain three values. Press Finish when done. You should now see the TPCH database on the left panel. Clicking on the >TPCH line should display the objects that are found in the database. You can now use dBeaver to navigate through the different schemas in the Presto database. The iceberg_data schema should also be visible in the dBeaver console. Open the iceberg_data catalog and search for the customer table under the workshop schema. This schema will only exist if you created it in the previous section on MinIO.","title":"dBeaver"},{"location":"wxd-dbeaver/#dbeaver-client-tool","text":"You could use any tool that supports connectivity through JDBC drivers to connect to watsonx.data, but we chose to use dBeaver for this lab. dBeaver is a client tool that we can use to connect to watsonx.data and execute queries. The tool has been installed in the watsonx user's home directory. 
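Any other JDBC-capable application can reuse the same connection details that are configured for dBeaver below; as a sketch (assuming the PrestoDB JDBC driver), the equivalent connection URL would be jdbc:presto://localhost:8443/tpch, with the driver properties SSL=true, SSLTrustStorePath=/certs/presto-key.jks, and SSLTrustStorePassword=watsonx.data. 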
To access dBeaver, you must use the VNC service which has been installed on this server for you.","title":"dBeaver Client Tool"},{"location":"wxd-dbeaver/#start-dbeaver-locally","text":"To start dBeaver, you must be connected to the VM console of the Linux server as the watsonx user (see Accessing the Console ). In the virtual machine, click on the Applications button, choose the Database folder and click on the dBeaver icon. The start-up screen for dBeaver will display. The dBeaver program may ask if you want to create an empty database or update the release. Just say No. The first dialog from dBeaver will ask you to create a database connection. If you do not see this screen, select Database, and then select New Database Connection:","title":"Start dBeaver Locally"},{"location":"wxd-dbeaver/#catalog-watsonxdata-connection","text":"We will use the PrestoDB JDBC connector (NOT PrestoSQL). PrestoSQL is the former name of Trino, a fork of PrestoDB; its driver may also work, but this lab uses the PrestoDB connector. Select SQL (see the left side) and scroll down until you see PrestoDB. Select PrestoDB and then press \"Next\". The following screen will be displayed. Enter the following values into the dialog. Note : These settings are case-sensitive. Host: localhost Port: 8443 Username: ibmlhadmin Password: password Database: tpch Then select the Driver Properties tab. You might be asked to download the database driver. Make sure you select \"Force Download\", otherwise it will not properly download the driver. Once downloaded, it will display the Driver properties dialog. Press the [+] button on the bottom left of the User Properties list. You need to enter three properties: SSL True SSLTrustStorePath /certs/presto-key.jks SSLTrustStorePassword watsonx.data Enter the property name \"SSL\", in uppercase (the parameter is case-sensitive!). When you hit OK it will display the setting in the list. Click on the SSL field and you will update the value to True and hit Enter. Add another field called SSLTrustStorePath and give it a value of /certs/presto-key.jks and finally add the SSLTrustStorePassword setting with a value of watsonx.data . The panel should now contain three values. Press Finish when done. You should now see the TPCH database on the left panel. Clicking on the >TPCH line should display the objects that are found in the database. You can now use dBeaver to navigate through the different schemas in the Presto database. The iceberg_data schema should also be visible in the dBeaver console. Open the iceberg_data catalog and search for the customer table under the workshop schema. This schema will only exist if you created it in the previous section on MinIO.","title":"Catalog watsonx.data Connection"},{"location":"wxd-disclaimer/","text":"Disclaimer Watsonx.data Copyright \u00a9 2024 by International Business Machines Corporation (IBM). All rights reserved. Printed in Canada. Except as permitted under the Copyright Act of 1976, no part of this publication may be reproduced or distributed in any form or by any means, or stored in a database or retrieval system, without the prior written permission of IBM, with the exception that the program listings may be entered, stored, and executed in a computer system, but they may not be reproduced for publication. The contents of this lab represent those features that may or may not be available in the current release of any products mentioned within this lab despite what the lab may say. 
IBM reserves the right to include or exclude any functionality mentioned in this lab for the current release of watsonx.data, or a subsequent release. In addition, any claims made in this lab are not official communications by IBM; rather, they are observed by the authors in unaudited testing and research. The views expressed in this lab are those of the authors and not necessarily those of the IBM Corporation; neither is liable for any of the claims, assertions, or contents in this lab. IBM's statements regarding its plans, directions, and intent are subject to change or withdrawal without notice and at IBM's sole discretion. Information regarding potential future products is intended to outline our general product direction and it should not be relied on in making a purchasing decision. The information mentioned regarding potential future products is not a commitment, promise, or legal obligation to deliver any material, code, or functionality. Information about potential future products may not be incorporated into any contract. The development, release, and timing of any future feature or functionality described for our products remains at our sole discretion. Performance is based on measurements and projections using standard IBM benchmarks in a controlled environment. The actual throughput or performance that any user will experience will vary depending upon many factors, including considerations such as the amount of multiprogramming in the user's job stream, the I/O configuration, the storage configuration, and the workload processed. Therefore, no assurance can be given that an individual user will achieve results like those stated here. U.S. Government Users Restricted Rights - Use, duplication or disclosure restricted by GSA ADP Schedule Contract with IBM. Information in this eBook (including information relating to products that have not yet been announced by IBM) has been reviewed for accuracy as of the date of initial publication and could include unintentional technical or typographical errors. IBM shall have no responsibility to update this information. THIS DOCUMENT IS DISTRIBUTED \"AS IS\" WITHOUT ANY WARRANTY, EITHER EXPRESS OR IMPLIED. IN NO EVENT SHALL IBM BE LIABLE FOR ANY DAMAGE ARISING FROM THE USE OF THIS INFORMATION, INCLUDING BUT NOT LIMITED TO, LOSS OF DATA, BUSINESS INTERRUPTION, LOSS OF PROFIT OR LOSS OF OPPORTUNITY. IBM products and services are warranted according to the terms and conditions of the agreements under which they are provided. References in this document to IBM products, programs, or services do not imply that IBM intends to make such products, programs, or services available in all countries in which IBM operates or does business. Information concerning non-IBM products was obtained from the suppliers of those products, their published announcements, or other publicly available sources. IBM has not tested those products in connection with this publication and cannot confirm the accuracy of performance, compatibility or any other claims related to non-IBM products. Questions on the capabilities of non-IBM products should be addressed to the suppliers of those products. IBM does not warrant the quality of any third-party products, or the ability of any such third-party products to interoperate with IBM's products. IBM EXPRESSLY DISCLAIMS ALL WARRANTIES, EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. 
The provision of the information contained herein is not intended to, and does not, grant any right or license under any IBM patents, copyrights, trademarks, or other intellectual property right. IBM, the IBM logo, ibm.com, Aspera\u00ae, Bluemix, Blueworks Live, CICS, Clearcase, Cognos\u00ae, DOORS\u00ae, Emptoris\u00ae, Enterprise Document Management System\u2122, FASP\u00ae, FileNet\u00ae, Global Business Services \u00ae, Global Technology Services \u00ae, IBM ExperienceOne\u2122, IBM SmartCloud\u00ae, IBM Social Business\u00ae, Information on Demand, ILOG, Maximo\u00ae, MQIntegrator\u00ae, MQSeries\u00ae, Netcool\u00ae, OMEGAMON, OpenPower, PureAnalytics\u2122, PureApplication\u00ae, pureCluster\u2122, PureCoverage\u00ae, PureData\u00ae, PureExperience\u00ae, PureFlex\u00ae, pureQuery\u00ae, pureScale\u00ae, PureSystems\u00ae, QRadar\u00ae, Rational\u00ae, Rhapsody\u00ae, Smarter Commerce\u00ae, SoDA, SPSS, Sterling Commerce\u00ae, StoredIQ, Tealeaf\u00ae, Tivoli\u00ae, Trusteer\u00ae, Unica\u00ae, urban{code}\u00ae, Watson, WebSphere\u00ae, Worklight\u00ae, X-Force\u00ae and System z\u00ae Z/OS are trademarks of International Business Machines Corporation, registered in many jurisdictions worldwide. Other product and service names might be trademarks of IBM or other companies. A current list of IBM trademarks is available on the Web at \"Copyright and trademark information\" at: www.ibm.com/legal/copytrade.shtml. All trademarks or copyrights mentioned herein are the possession of their respective owners and IBM makes no claim of ownership by the mention of products that contain these marks.","title":"Disclaimer"},{"location":"wxd-disclaimer/#disclaimer","text":"","title":"Disclaimer"},{"location":"wxd-disclaimer/#watsondata","text":"Copyright \u00a9 2024 by International Business Machines Corporation (IBM). All rights reserved. Printed in Canada. Except as permitted under the Copyright Act of 1976, no part of this publication may be reproduced or distributed in any form or by any means, or stored in a database or retrieval system, without the prior written permission of IBM, with the exception that the program listings may be entered, stored, and executed in a computer system, but they may not be reproduced for publication. The contents of this lab represent features that may or may not be available in the current release of any products mentioned within this lab, despite what the lab may say. IBM reserves the right to include or exclude any functionality mentioned in this lab for the current release of watsonx.data, or a subsequent release. In addition, any claims made in this lab are not official communications by IBM; rather, they are based on the authors' observations in unaudited testing and research. The views expressed in this lab are those of the authors and not necessarily those of the IBM Corporation; neither is liable for any of the claims, assertions, or contents in this lab. IBM's statements regarding its plans, directions, and intent are subject to change or withdrawal without notice and at IBM's sole discretion. Information regarding potential future products is intended to outline our general product direction, and it should not be relied on in making a purchasing decision. The information mentioned regarding potential future products is not a commitment, promise, or legal obligation to deliver any material, code, or functionality. Information about potential future products may not be incorporated into any contract. 
The development, release, and timing of any future feature or functionality described for our products remains at our sole discretion. Performance is based on measurements and projections using standard IBM benchmarks in a controlled environment. The actual throughput or performance that any user will experience will vary depending upon many factors, including considerations such as the amount of multiprogramming in the user's job stream, the I/O configuration, the storage configuration, and the workload processed. Therefore, no assurance can be given that an individual user will achieve results like those stated here. U.S. Government Users Restricted Rights - Use, duplication or disclosure restricted by GSA ADP Schedule Contract with IBM. Information in this eBook (including information relating to products that have not yet been announced by IBM) has been reviewed for accuracy as of the date of initial publication and could include unintentional technical or typographical errors. IBM shall have no responsibility to update this information. THIS DOCUMENT IS DISTRIBUTED \"AS IS\" WITHOUT ANY WARRANTY, EITHER EXPRESS OR IMPLIED. IN NO EVENT SHALL IBM BE LIABLE FOR ANY DAMAGE ARISING FROM THE USE OF THIS INFORMATION, INCLUDING BUT NOT LIMITED TO, LOSS OF DATA, BUSINESS INTERRUPTION, LOSS OF PROFIT OR LOSS OF OPPORTUNITY. IBM products and services are warranted according to the terms and conditions of the agreements under which they are provided. References in this document to IBM products, programs, or services do not imply that IBM intends to make such products, programs, or services available in all countries in which IBM operates or does business. Information concerning non-IBM products was obtained from the suppliers of those products, their published announcements, or other publicly available sources. IBM has not tested those products in connection with this publication and cannot confirm the accuracy of performance, compatibility or any other claims related to non-IBM products. Questions on the capabilities of non-IBM products should be addressed to the suppliers of those products. IBM does not warrant the quality of any third-party products, or the ability of any such third-party products to interoperate with IBM's products. IBM EXPRESSLY DISCLAIMS ALL WARRANTIES, EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. The provision of the information contained herein is not intended to, and does not, grant any right or license under any IBM patents, copyrights, trademarks, or other intellectual property right. 
IBM, the IBM logo, ibm.com, Aspera\u00ae, Bluemix, Blueworks Live, CICS, Clearcase, Cognos\u00ae, DOORS\u00ae, Emptoris\u00ae, Enterprise Document Management System\u2122, FASP\u00ae, FileNet\u00ae, Global Business Services \u00ae, Global Technology Services \u00ae, IBM ExperienceOne\u2122, IBM SmartCloud\u00ae, IBM Social Business\u00ae, Information on Demand, ILOG, Maximo\u00ae, MQIntegrator\u00ae, MQSeries\u00ae, Netcool\u00ae, OMEGAMON, OpenPower, PureAnalytics\u2122, PureApplication\u00ae, pureCluster\u2122, PureCoverage\u00ae, PureData\u00ae, PureExperience\u00ae, PureFlex\u00ae, pureQuery\u00ae, pureScale\u00ae, PureSystems\u00ae, QRadar\u00ae, Rational\u00ae, Rhapsody\u00ae, Smarter Commerce\u00ae, SoDA, SPSS, Sterling Commerce\u00ae, StoredIQ, Tealeaf\u00ae, Tivoli\u00ae, Trusteer\u00ae, Unica\u00ae, urban{code}\u00ae, Watson, WebSphere\u00ae, Worklight\u00ae, X-Force\u00ae and System z\u00ae Z/OS are trademarks of International Business Machines Corporation, registered in many jurisdictions worldwide. Other product and service names might be trademarks of IBM or other companies. A current list of IBM trademarks is available on the Web at \"Copyright and trademark information\" at: www.ibm.com/legal/copytrade.shtml. All trademarks or copyrights mentioned herein are the possession of their respective owners and IBM makes no claim of ownership by the mention of products that contain these marks.","title":"Watsonx.data"},{"location":"wxd-federation/","text":"Federation with watsonx.data Watsonx.data can federate data from other data sources. A few connectors are available out of the box, and additional connectors can be created using the SDK, although this involves some programming and testing effort and is not a trivial exercise. We will use the existing PostgreSQL instance, add some data, and test the federation capabilities. Open the developer sandbox and use existing scripts to create a PostgreSQL database and add some data. Switch to the bin directory as the root user. cd /root/ibm-lh-dev/bin Connect to the sandbox. ./dev-sandbox.sh Create the database. /scripts/create_db.sh pgdatadb exists result: CREATE DATABASE Connect to the database. /scripts/runsql.sh pgdatadb psql (11.19, server 13.4 (Debian 13.4-4.pgdg110+1)) WARNING: psql major version 11, server major version 13. Some psql features might not work. Type \"help\" for help. Create a Table. create table t1( c1 int, c2 int); CREATE TABLE Insert some sample data. insert into t1 values(1,2); INSERT 0 1 Quit Postgres. quit Quit Sandbox. exit PostgreSQL Properties To set up federation, we need to get the credentials for the PostgreSQL database. Use the following command to get the database password. export POSTGRES_PASSWORD=$(docker exec ibm-lh-postgres printenv | grep POSTGRES_PASSWORD | sed 's/.*=//') echo \"Postgres Userid : admin\" echo \"Postgres Password : \" $POSTGRES_PASSWORD echo $POSTGRES_PASSWORD > /tmp/postgres.pw Open your browser and navigate to: Watsonx.data UI - https://region.techzone-server.com:port Credentials: username: ibmlhadmin password: password Navigate to the Infrastructure manager by clicking on the icon below the Home symbol. You should see a panel like the following. On the top right-hand corner, select Add Component->Add database. The Add database dialog is displayed. 
Enter the following values: Database type \u2013 PostgreSQL Database name \u2013 pgdatadb Hostname \u2013 ibm-lh-postgres Port \u2013 5432 Display name \u2013 pgdatadb Username \u2013 admin Password \u2013 The value that was extracted in the earlier step Catalog Name \u2013 pgdatadb Your screen should look like the one below. You can press the \"Test\" button to check whether the connection settings are correct. Once you are satisfied with the settings, press \"Add\". The infrastructure screen should now show the Postgres database. What is currently missing is the connection between the Presto engine and the Postgres data in pgdatadb. We must connect the pgdatadb database to the Presto engine. Use your mouse to hover over the pgdatadb icon until you see the Associate connection icon: Click on the association icon. You should see the following confirmation dialog: Select the presto-01 engine and press Save and restart engine . Press the Associate button and the screen will update to show the connection. Presto Federation First check to make sure that the Presto engine has finished starting. While the watsonx.data UI has restarted the Presto process, it takes a few seconds to become available. check_presto When the command comes back as Ready, you can start using the Presto CLI. Connect to watsonx.data and try Federation. ./presto-cli --catalog pgdatadb Show the current schemas. show schemas; Schema -------------------- pg_catalog public (2 rows) Use the public schema. use public; Select the table we created in Postgres. select * from public.t1; c1 | c2 ----+---- 1 | 2 (1 row) Join with data from other schemas (Sample TPCH+PostgreSQL). select t1.*,customer.name from tpch.tiny.customer, pgdatadb.public.t1 limit 10; c1 | c2 | name ----+----+-------------------- 1 | 2 | Customer#000000001 1 | 2 | Customer#000000002 1 | 2 | Customer#000000003 1 | 2 | Customer#000000004 1 | 2 | Customer#000000005 1 | 2 | Customer#000000006 1 | 2 | Customer#000000007 1 | 2 | Customer#000000008 (10 rows) Quit Presto. quit;","title":"Federation"},{"location":"wxd-federation/#federation-with-watsonxdata","text":"Watsonx.data can federate data from other data sources. A few connectors are available out of the box, and additional connectors can be created using the SDK, although this involves some programming and testing effort and is not a trivial exercise. We will use the existing PostgreSQL instance, add some data, and test the federation capabilities. Open the developer sandbox and use existing scripts to create a PostgreSQL database and add some data. Switch to the bin directory as the root user. cd /root/ibm-lh-dev/bin Connect to the sandbox. ./dev-sandbox.sh Create the database. /scripts/create_db.sh pgdatadb exists result: CREATE DATABASE Connect to the database. /scripts/runsql.sh pgdatadb psql (11.19, server 13.4 (Debian 13.4-4.pgdg110+1)) WARNING: psql major version 11, server major version 13. Some psql features might not work. Type \"help\" for help. Create a Table. create table t1( c1 int, c2 int); CREATE TABLE Insert some sample data. insert into t1 values(1,2); INSERT 0 1 Quit Postgres. quit Quit Sandbox. exit","title":"Federation with watsonx.data"},{"location":"wxd-federation/#postgresql-properties","text":"To set up federation, we need to get the credentials for the PostgreSQL database. Use the following command to get the database password. 
export POSTGRES_PASSWORD=$(docker exec ibm-lh-postgres printenv | grep POSTGRES_PASSWORD | sed 's/.*=//') echo \"Postgres Userid : admin\" echo \"Postgres Password : \" $POSTGRES_PASSWORD echo $POSTGRES_PASSWORD > /tmp/postgres.pw Open your browser and navigate to: Watsonx.data UI - https://region.techzone-server.com:port Credentials: username: ibmlhadmin password: password Navigate to the Infrastructure manager by clicking on the icon below the Home symbol. You should see a panel like the following. On the top right-hand corner, select Add Component->Add database. The Add database dialog is displayed. Enter the following values: Database type \u2013 PostgreSQL Database name \u2013 pgdatadb Hostname \u2013 ibm-lh-postgres Port \u2013 5432 Display name \u2013 pgdatadb Username \u2013 admin Password \u2013 The value that was extracted in the earlier step Catalog Name \u2013 pgdatadb Your screen should look like the one below. You can press the \"Test\" button to check whether the connection settings are correct. Once you are satisfied with the settings, press \"Add\". The infrastructure screen should now show the Postgres database. What is currently missing is the connection between the Presto engine and the Postgres data in pgdatadb. We must connect the pgdatadb database to the Presto engine. Use your mouse to hover over the pgdatadb icon until you see the Associate connection icon: Click on the association icon. You should see the following confirmation dialog: Select the presto-01 engine and press Save and restart engine . Press the Associate button and the screen will update to show the connection.","title":"PostgreSQL Properties"},{"location":"wxd-federation/#presto-federation","text":"First check to make sure that the Presto engine has finished starting. While the watsonx.data UI has restarted the Presto process, it takes a few seconds to become available. check_presto When the command comes back as Ready, you can start using the Presto CLI. Connect to watsonx.data and try Federation. ./presto-cli --catalog pgdatadb Show the current schemas. show schemas; Schema -------------------- pg_catalog public (2 rows) Use the public schema. use public; Select the table we created in Postgres. select * from public.t1; c1 | c2 ----+---- 1 | 2 (1 row) Join with data from other schemas (Sample TPCH+PostgreSQL). select t1.*,customer.name from tpch.tiny.customer, pgdatadb.public.t1 limit 10; c1 | c2 | name ----+----+-------------------- 1 | 2 | Customer#000000001 1 | 2 | Customer#000000002 1 | 2 | Customer#000000003 1 | 2 | Customer#000000004 1 | 2 | Customer#000000005 1 | 2 | Customer#000000006 1 | 2 | Customer#000000007 1 | 2 | Customer#000000008 (10 rows) Quit Presto. quit;","title":"Presto Federation"},{"location":"wxd-glossary/","text":"Glossary Apache Superset : Apache Superset is an open-source software application for data exploration and data visualization able to handle data at petabyte scale. Apache Superset is a modern, enterprise-ready business intelligence web application. It is fast, lightweight, intuitive, and loaded with options that make it easy for users of all skill sets to explore and visualize their data, from simple pie charts to highly detailed geospatial charts. Application Programming Interface (API) : Application Programming Interface (API) is a programmatic interface for executing functions of an application in an automated or manual fashion without using a CLI or User Interface. Buckets : Buckets are the basic containers that hold your data. 
Everything that you store in Cloud Storage must be contained in a bucket. You can use buckets to organize your data and control access to your data, but unlike directories and folders, you cannot nest buckets. Catalog : This term may have many meanings depending on context. Review below: Service Catalog - A service catalog is a comprehensive list of cloud computing services that an organization offers its customers. The catalog is the only portion of the company's service portfolio that is published and provided to customers as a support to the sale or delivery of offered services. Data Catalog - A collection of business information describing the available datasets within an organization. Metastore Catalog - A collection of technical and operational metadata allowing a query engine to overlay a virtual table on a collection of discrete data files. Connector Catalog - The named representation of a connector within the virtual warehouse of a Presto instance. Command Line Interface (CLI) : A command-line interface (CLI) is a text-based user interface (UI) used to run programs, manage computer files and interact with the computer. dBeaver : DBeaver is a SQL client software application and a database administration tool. For relational databases it uses the JDBC application programming interface to interact with databases via a JDBC driver. For other databases it uses proprietary database drivers. Federation : A federated database is a system in which several databases appear to function as a single entity. Each component database in the system is completely self-sustained and functional. When an application queries the federated database, the system figures out which of its component databases contains the data being requested and passes the request to it. Federated databases can be thought of as database virtualization in much the same way that storage virtualization makes several drives appear as one. MinIO : MinIO is a high-performance, S3 compatible object store. It is built for large scale AI/ML, data lake and database workloads. It runs on-prem and on any cloud (public or private) and from the data center to the edge. MinIO is software-defined and open source under GNU AGPL v3. Object Storage : Object storage is a data storage architecture for storing unstructured data, which sections data into units\u2014objects\u2014and stores them in a structurally flat data environment. Each object includes the data, metadata, and a unique identifier that applications can use for easy access and retrieval. Presto : Presto is a distributed database query engine (written in Java) that uses the SQL query language. Its architecture allows users to query data sources such as Hadoop, Cassandra, Kafka, AWS S3, Alluxio, MySQL, MongoDB and Teradata, and allows use of multiple data sources within a query. Presto is community-driven open-source software released under the Apache License. Presto's architecture is very similar to other database management systems using cluster computing, sometimes called massively parallel processing (MPP). Spark : Apache Spark is an open-source unified analytics engine for large-scale data processing. Spark provides an interface for programming clusters with implicit data parallelism and fault tolerance. Spark can be used with watsonx.data but is not included in the watsonx.data environment image provided. TechZone (IBM Technology Zone) : IBM Technology Zone is the platform where the developer edition of watsonx.data with the sample data sets has been provisioned. 
Generally, it allows Go To Market teams and Business Partners to easily build technical 'Show Me' live environments, POTs, prototypes, and MVPs, which can then be customized and shared with peers and customers to experience IBM Technology. VNC (Virtual Network Computing) : VNC is a cross-platform screen-sharing system that uses the Remote Frame Buffer (RFB) protocol. VNC was created to control another computer remotely. You may know it best for its role in tech support services. Use of VNC is optional. VNC can be used after the WireGuard VPN has been activated to access the watsonx.data server. WireGuard : WireGuard is a communication protocol and free and open-source software that implements encrypted virtual private networks, and was designed with the goals of ease of use, high speed performance, and low attack surface. You will need to install the Wireguard software and download the server VPN certificate in order to access the watsonx.data server.","title":"Glossary"},{"location":"wxd-glossary/#glossary","text":"Apache Superset : Apache Superset is an open-source software application for data exploration and data visualization able to handle data at petabyte scale. Apache Superset is a modern, enterprise-ready business intelligence web application. It is fast, lightweight, intuitive, and loaded with options that make it easy for users of all skill sets to explore and visualize their data, from simple pie charts to highly detailed geospatial charts. Application Programming Interface (API) : Application Programming Interface (API) is a programmatic interface for executing functions of an application in an automated or manual fashion without using a CLI or User Interface. Buckets : Buckets are the basic containers that hold your data. Everything that you store in Cloud Storage must be contained in a bucket. You can use buckets to organize your data and control access to your data, but unlike directories and folders, you cannot nest buckets. Catalog : This term may have many meanings depending on context. Review below: Service Catalog - A service catalog is a comprehensive list of cloud computing services that an organization offers its customers. The catalog is the only portion of the company's service portfolio that is published and provided to customers as a support to the sale or delivery of offered services. Data Catalog - A collection of business information describing the available datasets within an organization. Metastore Catalog - A collection of technical and operational metadata allowing a query engine to overlay a virtual table on a collection of discrete data files. Connector Catalog - The named representation of a connector within the virtual warehouse of a presto instance. Command Line Interface (CLI) : A command-line interface (CLI) is a text-based user interface (UI) used to run programs, manage computer files and interact with the computer. dBeaver : DBeaver is a SQL client software application and a database administration tool. For relational databases it uses the JDBC application programming interface to interact with databases via a JDBC driver. For other databases it uses proprietary database drivers. Federation : A federated database is a system in which several databases appear to function as a single entity. Each component database in the system is completely self-sustained and functional. When an application queries the federated database, the system figures out which of its component databases contains the data being requested and passes the request to it. 
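To make the Federation definition concrete, the Federation lab earlier in this guide joins a PostgreSQL table with a Presto TPCH table. A minimal sketch of such a federated join with an explicit join predicate (an illustration only, assuming the pgdatadb catalog has been associated with the presto-01 engine and the public.t1 table exists, run from /root/ibm-lh-dev/bin):

```bash
# One statement spanning two catalogs: Presto pushes the PostgreSQL portion
# down to the pgdatadb connector and combines the results. The predicate is
# illustrative; t1.c1 holds the value 1, which matches custkey 1.
./presto-cli --execute "
SELECT c.name, t.c1, t.c2
FROM tpch.tiny.customer AS c
JOIN pgdatadb.public.t1 AS t
  ON c.custkey = t.c1;"
```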
Federated databases can be thought of as database virtualization in much the same way that storage virtualization makes several drives appear as one. MinIO : MinIO is a high-performance, S3 compatible object store. It is built for large scale AI/ML, data lake and database workloads. It runs on-prem and on any cloud (public or private) and from the data center to the edge. MinIO is software-defined and open source under GNU AGPL v3. Object Storage : Object storage is a data storage architecture for storing unstructured data, which sections data into units\u2014objects\u2014and stores them in a structurally flat data environment. Each object includes the data, metadata, and a unique identifier that applications can use for easy access and retrieval. Presto : Presto is a distributed database query engine (written in Java) that uses the SQL query language. Its architecture allows users to query data sources such as Hadoop, Cassandra, Kafka, AWS S3, Alluxio, MySQL, MongoDB and Teradata, and allows use of multiple data sources within a query. Presto is community-driven open-source software released under the Apache License. Presto's architecture is very similar to other database management systems using cluster computing, sometimes called massively parallel processing (MPP). Spark : Apache Spark is an open-source unified analytics engine for large-scale data processing. Spark provides an interface for programming clusters with implicit data parallelism and fault tolerance. Spark can be used with watsonx.data but is not included in the watsonx.data environment image provided. TechZone (IBM Technology Zone) : IBM Technology Zone is the platform where the developer edition of watsonx.data with the sample data sets has been provisioned. Generally, it allows Go To Market teams and Business Partners to easily build technical 'Show Me' live environments, POTs, prototypes, and MVPs, which can then be customized and shared with peers and customers to experience IBM Technology. VNC (Virtual Network Computing) : VNC is a cross-platform screen-sharing system that uses the Remote Frame Buffer (RFB) protocol. VNC was created to control another computer remotely. You may know it best for its role in tech support services. Use of VNC is optional. VNC can be used after the WireGuard VPN has been activated to access the watsonx.data server. WireGuard : WireGuard is a communication protocol and free and open-source software that implements encrypted virtual private networks, and was designed with the goals of ease of use, high speed performance, and low attack surface. You will need to install the Wireguard software and download the server VPN certificate in order to access the watsonx.data server.","title":"Glossary"},{"location":"wxd-ingest/","text":"Ingesting Data In this lab we will use the ingest tool (lh-tool) alongside the IBM watsonx.data developer edition that is running in this environment. The Ingest tool is a separate install and currently needs to be downloaded after IBM watsonx.data is started. The lab image contains a copy of this code, so you will not need to download it. In addition, there is a staging file (yellow_tripdata_2022-01.parquet) found in the sample data directory that will be used for loading data into the system. As the root user, switch to the client bin directory. cd /root/ibm-lh-client/bin Ingest data into the IBM watsonx.data Before running the utility, we need to retrieve several credentials for MinIO and the keystore password. 
export LH_S3_ACCESS_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_ACCESS_KEY | sed 's/.*=//') export LH_S3_SECRET_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_SECRET_KEY | sed 's/.*=//') export LH_KEYSTORE_PASSWORD=$(docker exec ibm-lh-presto printenv | grep LH_KEYSTORE_PASSWORD | sed 's/.*=//') We need to generate three export lines and create a directory that will be used later in another script. The staging directory is used to move files between this system and the docker container that the ibm-lh tool is running in. export staging=/root/ibm-lh-client/localstorage/volumes/infra/staging mkdir -p ${staging} cat << EOF > ${staging}/keys.sh #!/bin/bash export access_key=$LH_S3_ACCESS_KEY export secret_key=$LH_S3_SECRET_KEY export keystore_password=$LH_KEYSTORE_PASSWORD EOF chmod +x ${staging}/keys.sh A copy of the MinIO SSL certificate needs to be copied from the docker container. In addition, the file that we want loaded into watsonx.data will be moved into the staging file directory. docker cp ibm-lh-presto:/mnt/infra/tls/lh-ssl-ts.jks ${staging}/lh-ssl-ts.jks \\cp -f /sampledata/yellow_tripdata_2022-01.parquet ${staging}/. Create a hive schema for staging the ingest file Before ingesting the file, we need to create a new schema that we will use for the table being loaded. Your TechZone reservation will include the server name and port number to connect to the watsonx.data UI. In the watsonx.data UI select the Data Explorer. You should see a screen like the following. Use the \"Create\" pulldown and select Create schema in the hive_data catalog. Select the hive_data catalog and use staging as the new schema name. Press the Create button to finish the creation of the schema. You should see the new staging schema under hive_data . You need to repeat the same process again, but this time you are going to add a schema called ingest in the iceberg_data catalog. You should see the new ingest schema in the navigator screen. Start the IBM tools Container To access the tools container, we issue the ibm-lh command with the data-copy option. ./ibm-lh data-copy /////////////////////////////////////// /////////////////////////////////////// _ _ _ _ | |__ _, ,_ | || |_ _ | || '_ \\ / /\\//| |_ _| || |_ | | || |_) || | | |_ _| || | | | |_||_.__/ |_| |_| |_||_| |_| /////////////////////////////////////// /////////////////////////////////////// Once inside the utility, you can use the following command to get more details on the data-copy option. ibm-lh data-copy --help Exit from the tools container by using the exit command. exit The following script is used to ingest the taxi data (yellow_tripdata_2022_01) into watsonx.data. Choose the script which corresponds to the release of watsonx.data that you are currently running. 
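The two script variants below differ only in how the staging credentials are passed. If you are not sure which developer-edition release you are running, one hint (an assumption on our part, since tagging conventions can vary between builds) is the image tag on the running watsonx.data containers:

```bash
# List the watsonx.data containers with their image names; the tag at the end
# of the image name usually reflects the installed release.
docker ps --format '{{.Names}}\t{{.Image}}' | grep ibm-lh
```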
Watsonx.data Version 1.0.3 cat << EOF > ${staging}/ingest-local.sh #!/bin/bash dir=/mnt/infra/staging cd \${dir} source ./keys.sh table_name=\"iceberg_data.ingest.yellow_tripdata_2022_01_localfile\" file=\"yellow_tripdata_2022-01.parquet\" ibm-lh data-copy \\\\ --source-data-files \${dir}/\${file} \\\\ --target-tables \${table_name} \\\\ --ingestion-engine-endpoint \"hostname=ibm-lh-presto-svc,port=8443\" \\\\ --staging-location s3://iceberg-bucket/ingest/ \\\\ --staging-hive-catalog hive_data \\\\ --staging-hive-schema staging \\\\ --staging-s3-creds \\\\ \"AWS_SECRET_ACCESS_KEY=\${secret_key}\\\\ ,AWS_ACCESS_KEY_ID=\${access_key}\\\\ ,AWS_REGION=us-east-1\\\\ ,BUCKET_NAME=iceberg-bucket\\\\ ,ENDPOINT_URL=http://ibm-lh-minio:9000\" \\\\ --create-if-not-exist \\\\ --trust-store-path \${dir}/lh-ssl-ts.jks \\\\ --trust-store-password \${keystore_password} \\\\ --dbuser ibmlhadmin \\\\ --dbpassword password EOF sed -i '/^$/d' ${staging}/ingest-local.sh chmod +x ${staging}/ingest-local.sh Watsonx.data Version 1.1.0 In version 1.1.0, the --staging-s3-creds option is replaced with an environment variable called STAGING_S3_CREDS . cat << EOF > ${staging}/ingest-local.sh #!/bin/bash dir=/mnt/infra/staging cd \${dir} source ./keys.sh table_name=\"iceberg_data.ingest.yellow_tripdata_2022_01_localfile\" file=\"yellow_tripdata_2022-01.parquet\" export STAGING_S3_CREDS=\"AWS_SECRET_ACCESS_KEY=\${secret_key}\\\\ ,AWS_ACCESS_KEY_ID=\${access_key}\\\\ ,AWS_REGION=us-east-1\\\\ ,BUCKET_NAME=iceberg-bucket\\\\ ,ENDPOINT_URL=http://ibm-lh-minio:9000\" ibm-lh data-copy \\\\ --source-data-files \${dir}/\${file} \\\\ --target-tables \${table_name} \\\\ --ingestion-engine-endpoint \"hostname=ibm-lh-presto-svc,port=8443\" \\\\ --staging-location s3://iceberg-bucket/ingest/ \\\\ --staging-hive-catalog hive_data \\\\ --staging-hive-schema staging \\\\ --create-if-not-exist \\\\ --trust-store-path \${dir}/lh-ssl-ts.jks \\\\ --trust-store-password \${keystore_password} \\\\ --dbuser ibmlhadmin \\\\ --dbpassword password EOF sed -i '/^$/d' ${staging}/ingest-local.sh chmod +x ${staging}/ingest-local.sh Start the Ingest Process Start the ibm-lh container again: ./ibm-lh data-copy Now run the ingest job inside the tool container. /mnt/infra/staging/ingest-local.sh Start data migration Ingesting SECTION: cmdline Reading parquet file:/staging/yellow_tripdata_2022-01.parquet Inferring source schema... Schema inferred Ingesting source folder s3://dev-bucket-01/ingest/stage_1686085369_19_ea7fa9994c96/ into target table ingest.yellow_tripdata_2022_01_localfile The specified table does not exist Target table does not exist.. creating Current State: RUNNING Rows Ingested: 408575 Current State: RUNNING Rows Ingested: 52 Current State: 100% FINISHED Done ingesting into table: ingest.yellow_tripdata_2022_01_localfile Complete migration After ingesting the data, exit the docker container. exit Refresh the IBM watsonx.data UI to view the iceberg_data catalog in the Data Explorer. Click on the yellow_tripdata table to see the schema definition. Then click on the Data sample tab to see a snippet of the data. Now we can use the UI to run a query against this imported data. Select the SQL icon on the left side of the display. On the line where the yellow_tripdata table is located, click the icon at the end of the name. This will display a drop-down list. Select \"Generate SELECT\". This will generate a SQL statement in the window to the right of the table name. Now execute the query to see what the results are. That completes the labs! 
Congratulations, you are done!","title":"Ingesting Data"},{"location":"wxd-ingest/#ingesting-data","text":"In this lab we will use the ingest tool (lh-tool) alongside the IBM watsonx.data developer edition that is running in this environment. The Ingest tool is a separate install and currently needs to be downloaded after IBM watsonx.data is started. The lab image contains a copy of this code, so you will not need to download it. In addition, there is a staging file (yellow_tripdata_2022-01.parquet) found in the sample data directory that will be used for loading data into the system. As the root user, switch to the client bin directory. cd /root/ibm-lh-client/bin","title":"Ingesting Data"},{"location":"wxd-ingest/#ingest-data-into-the-ibm-watsonxdata","text":"Before running the utility, we need to retrieve several credentials for MinIO and the keystore password. export LH_S3_ACCESS_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_ACCESS_KEY | sed 's/.*=//') export LH_S3_SECRET_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_SECRET_KEY | sed 's/.*=//') export LH_KEYSTORE_PASSWORD=$(docker exec ibm-lh-presto printenv | grep LH_KEYSTORE_PASSWORD | sed 's/.*=//') We need to generate three export lines and create a directory that will be used later in another script. The staging directory is used to move files between this system and the docker container that the ibm-lh tool is running in. export staging=/root/ibm-lh-client/localstorage/volumes/infra/staging mkdir -p ${staging} cat << EOF > ${staging}/keys.sh #!/bin/bash export access_key=$LH_S3_ACCESS_KEY export secret_key=$LH_S3_SECRET_KEY export keystore_password=$LH_KEYSTORE_PASSWORD EOF chmod +x ${staging}/keys.sh A copy of the MinIO SSL certificate needs to be copied from the docker container. In addition, the file that we want loaded into watsonx.data will be moved into the staging file directory. docker cp ibm-lh-presto:/mnt/infra/tls/lh-ssl-ts.jks ${staging}/lh-ssl-ts.jks \\cp -f /sampledata/yellow_tripdata_2022-01.parquet ${staging}/.","title":"Ingest data into the IBM watsonx.data"},{"location":"wxd-ingest/#create-a-hive-schema-for-staging-the-ingest-file","text":"Before ingesting the file, we need to create a new schema that we will use for the table being loaded. Your TechZone reservation will include the server name and port number to connect to the watsonx.data UI. In the watsonx.data UI select the Data Explorer. You should see a screen like the following. Use the \"Create\" pulldown and select Create schema in the hive_data catalog. Select the hive_data catalog and use staging as the new schema name. Press the Create button to finish the creation of the schema. You should see the new staging schema under hive_data . You need to repeat the same process again, but this time you are going to add a schema called ingest in the iceberg_data catalog. You should see the new ingest schema in the navigator screen.","title":"Create a hive schema for staging the ingest file"},{"location":"wxd-ingest/#start-the-ibm-tools-container","text":"To access the tools container, we issue the ibm-lh command with the data-copy option. ./ibm-lh data-copy /////////////////////////////////////// /////////////////////////////////////// _ _ _ _ | |__ _, ,_ | || |_ _ | || '_ \\ / /\\//| |_ _| || |_ | | || |_) || | | |_ _| || | | | |_||_.__/ |_| |_| |_||_| |_| /////////////////////////////////////// /////////////////////////////////////// Once inside the utility, you can use the following command to get more details on the data-copy option. 
ibm-lh data-copy --help Exit from the tools container by using the exit command. exit The following script is used to ingest the taxi data (yellow_tripdata_2022_01) into watsonx.data. Choose the script which corresponds to the release of watsonx.data that you are currently running.","title":"Start the IBM tools Container"},{"location":"wxd-ingest/#watsonxdata-version-103","text":"cat << EOF > ${staging}/ingest-local.sh #!/bin/bash dir=/mnt/infra/staging cd \${dir} source ./keys.sh table_name=\"iceberg_data.ingest.yellow_tripdata_2022_01_localfile\" file=\"yellow_tripdata_2022-01.parquet\" ibm-lh data-copy \\\\ --source-data-files \${dir}/\${file} \\\\ --target-tables \${table_name} \\\\ --ingestion-engine-endpoint \"hostname=ibm-lh-presto-svc,port=8443\" \\\\ --staging-location s3://iceberg-bucket/ingest/ \\\\ --staging-hive-catalog hive_data \\\\ --staging-hive-schema staging \\\\ --staging-s3-creds \\\\ \"AWS_SECRET_ACCESS_KEY=\${secret_key}\\\\ ,AWS_ACCESS_KEY_ID=\${access_key}\\\\ ,AWS_REGION=us-east-1\\\\ ,BUCKET_NAME=iceberg-bucket\\\\ ,ENDPOINT_URL=http://ibm-lh-minio:9000\" \\\\ --create-if-not-exist \\\\ --trust-store-path \${dir}/lh-ssl-ts.jks \\\\ --trust-store-password \${keystore_password} \\\\ --dbuser ibmlhadmin \\\\ --dbpassword password EOF sed -i '/^$/d' ${staging}/ingest-local.sh chmod +x ${staging}/ingest-local.sh","title":"Watsonx.data Version 1.0.3"},{"location":"wxd-ingest/#watsonxdata-version-110","text":"In version 1.1.0, the --staging-s3-creds option is replaced with an environment variable called STAGING_S3_CREDS . cat << EOF > ${staging}/ingest-local.sh #!/bin/bash dir=/mnt/infra/staging cd \${dir} source ./keys.sh table_name=\"iceberg_data.ingest.yellow_tripdata_2022_01_localfile\" file=\"yellow_tripdata_2022-01.parquet\" export STAGING_S3_CREDS=\"AWS_SECRET_ACCESS_KEY=\${secret_key}\\\\ ,AWS_ACCESS_KEY_ID=\${access_key}\\\\ ,AWS_REGION=us-east-1\\\\ ,BUCKET_NAME=iceberg-bucket\\\\ ,ENDPOINT_URL=http://ibm-lh-minio:9000\" ibm-lh data-copy \\\\ --source-data-files \${dir}/\${file} \\\\ --target-tables \${table_name} \\\\ --ingestion-engine-endpoint \"hostname=ibm-lh-presto-svc,port=8443\" \\\\ --staging-location s3://iceberg-bucket/ingest/ \\\\ --staging-hive-catalog hive_data \\\\ --staging-hive-schema staging \\\\ --create-if-not-exist \\\\ --trust-store-path \${dir}/lh-ssl-ts.jks \\\\ --trust-store-password \${keystore_password} \\\\ --dbuser ibmlhadmin \\\\ --dbpassword password EOF sed -i '/^$/d' ${staging}/ingest-local.sh chmod +x ${staging}/ingest-local.sh","title":"Watsonx.data Version 1.1.0"},{"location":"wxd-ingest/#start-the-ingest-process","text":"Start the ibm-lh container again: ./ibm-lh data-copy Now run the ingest job inside the tool container. /mnt/infra/staging/ingest-local.sh Start data migration Ingesting SECTION: cmdline Reading parquet file:/staging/yellow_tripdata_2022-01.parquet Inferring source schema... Schema inferred Ingesting source folder s3://dev-bucket-01/ingest/stage_1686085369_19_ea7fa9994c96/ into target table ingest.yellow_tripdata_2022_01_localfile The specified table does not exist Target table does not exist.. creating Current State: RUNNING Rows Ingested: 408575 Current State: RUNNING Rows Ingested: 52 Current State: 100% FINISHED Done ingesting into table: ingest.yellow_tripdata_2022_01_localfile Complete migration After ingesting the data, exit the docker container. exit Refresh the IBM watsonx.data UI to view the iceberg_data catalog in the Data Explorer. Click on the yellow_tripdata table to see the schema definition. 
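Beyond the UI, you can also confirm the load from the command line. A minimal sanity check (assuming the default lab layout, with presto-cli available in /root/ibm-lh-dev/bin) is to count the rows in the new table; the total should line up with the Rows Ingested figures in the log above:

```bash
# Count the rows that data-copy just ingested into the iceberg_data catalog.
cd /root/ibm-lh-dev/bin
./presto-cli --catalog iceberg_data --schema ingest \
   --execute "SELECT count(*) FROM yellow_tripdata_2022_01_localfile;"
```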
Then click on the Data sample tab to see a snippet of the data. Now we can use the UI to run a query against this imported data. Select the SQL icon on the left side of the display. On the line where the yellow_tripdata table is located, click the icon at the end of the name. This will display a drop-down list. Select \"Generate SELECT\". This will generate a SQL statement in the window to the right of the table name. Now execute the query to see what the results are. That completes the labs! Congratulations, you are done!","title":"Start the Ingest Process"},{"location":"wxd-intro/","text":"Introducing watsonx.data The next-gen watsonx.data lakehouse is designed to overcome the costs and complexities enterprises face. This will be the world\u2019s first and only open data store with multi-engine support that is built for hybrid deployment across your entire ecosystem. WatsonX.data is the only lakehouse with multiple query engines allowing you to optimize costs and performance by pairing the right workload with the right engine. Run all workloads from a single pane of glass, eliminating trade-offs with convenience while still improving cost and performance. Deploy anywhere with full support for hybrid-cloud and multi cloud environments. Shared metadata across multiple engines eliminates the need to re-catalog, accelerating time to value while ensuring governance and eliminating costly implementation efforts. This lab uses the watsonx.data developer package. The Developer package is meant to be used on single nodes. While it uses the same code base, there are some restrictions, especially on scale. In this lab, we will open some additional ports as well to understand how everything works. We will also use additional utilities to illustrate connectivity and what makes the watsonx.data system \"open\". We organized this lab into a number of sections that cover many of the highlights and key features of watsonx.data. Access a TechZone or VMWare image for testing Checking watsonx.data status Introduction to watsonx.data components Analytical SQL Advanced SQL functions Time Travel and Federation Working with Object Store Buckets In addition, there is an Appendix which includes common errors and potential fixes or workarounds. Watsonx.data Developer Image The watsonx.data system is running on a virtual machine with the following resources: 4 vCPUs 16Gb of memory 400Gb of disk This is sufficient for running the exercises found in this lab but should not be used for performance testing or dealing with large data sets. Watsonx.data Level 3 Technical Training This system is used as a basis for the watsonx.data Level 3 Technical Training. For the detailed lab material, please refer to the following PDF found in Seismic: https://ibm.seismic.com/Link/Content/DCG37pjmPj7VmGCHj2Df8fHVmDJj","title":"Introduction"},{"location":"wxd-intro/#introducing-watsonxdata","text":"The next-gen watsonx.data lakehouse is designed to overcome the costs and complexities enterprises face. This will be the world\u2019s first and only open data store with multi-engine support that is built for hybrid deployment across your entire ecosystem. WatsonX.data is the only lakehouse with multiple query engines allowing you to optimize costs and performance by pairing the right workload with the right engine. Run all workloads from a single pane of glass, eliminating trade-offs with convenience while still improving cost and performance. Deploy anywhere with full support for hybrid-cloud and multi cloud environments. 
Shared metadata across multiple engines eliminates the need to re-catalog, accelerating time to value while ensuring governance and eliminating costly implementation efforts. This lab uses the watsonx.data developer package. The Developer package is meant to be used on single nodes. While it uses the same code base, there are some restrictions, especially on scale. In this lab, we will open some additional ports as well to understand how everything works. We will also use additional utilities to illustrate connectivity and what makes the watsonx.data system \"open\". We organized this lab into a number of sections that cover many of the highlights and key features of watsonx.data. Access a TechZone or VMWare image for testing Checking watsonx.data status Introduction to watsonx.data components Analytical SQL Advanced SQL functions Time Travel and Federation Working with Object Store Buckets In addition, there is an Appendix which includes common errors and potential fixes or workarounds.","title":"Introducing watsonx.data"},{"location":"wxd-intro/#watsonxdata-developer-image","text":"The watsonx.data system is running on a virtual machine with the following resources: 4 vCPUs 16Gb of memory 400Gb of disk This is sufficient for running the exercises found in this lab but should not be used for performance testing or dealing with large data sets.","title":"Watsonx.data Developer Image"},{"location":"wxd-intro/#watsonxdata-level-3-technical-training","text":"This system is used as a basis for the watsonx.data Level 3 Technical Training. For the detailed lab material, please refer to the following PDF found in Seismic: https://ibm.seismic.com/Link/Content/DCG37pjmPj7VmGCHj2Df8fHVmDJj","title":"Watsonx.data Level 3 Technical Training"},{"location":"wxd-jupyter/","text":"Jupyter Notebook The watsonx.data server includes the Jupyter Notebook service which provides an interactive way of exploring the features of the Presto database. The link to the Jupyter Notebook table of contents is provided in your TechZone reservation. When you initially open the link, it will request a password to view the Table of Contents: The default password for the notebook is watsonx.data . Once you enter the password, the Table of Contents will be displayed. There are 11 notebooks provided in the system, and a brief description of each notebook is found below. Introduction to Jupyter Notebooks If you are not familiar with the use of Jupyter notebooks, this will be a good starting point. To view the notebook, click on the blue arrow found at the bottom of the box. This will open a new tab in your browser with the contents of the notebook. This notebook provides an introduction to what Jupyter Notebooks are and what the common tasks are that you can perform in a notebook. Watsonx.data Credentials This is a key notebook for you to use during your work with the watsonx.data system. This notebook provides details on the userids and passwords for the services that are running in the server. There is no need to use a terminal command line to determine what the credentials are! In addition to the userids and passwords, this notebook provides a convenient way of downloading the certificate required to connect to the Presto database. Simply click on the certificate link and it will be downloaded to your local machine. Presto Magic Commands Magic commands are special macros found in Jupyter notebooks that simplify many tasks, including the ability to run SQL commands against a database. 
This notebook provides an introduction to what magic commands are and how you can use the Presto magic commands to connect and query the Presto database. Introduction to Presto SQL The watsonx.data lab has two ways of running SQL against the Presto database: Presto CLI commands Python/Pandas/Magic commands This notebook contains all the SQL that is run in the Presto SQL section of the lab. Instead of using the presto-cli command, this notebook uses magic commands to simplify the SQL execution. You can choose either method to explore Presto SQL. Presto Federation Presto provides the ability to federate queries across different servers. This notebook explores the ability to federate a PostgreSQL table with a table found in Presto. This lab requires some knowledge of the watsonx.data UI, so it is recommended you become familiar with the UI before running this lab. Python with watsonx.data Accessing the Presto database in Python requires the use of the prestodb module which implements features of the DBAPI standard. The notebook demonstrates how to connect to the database and retrieve results. Pandas Dataframes with watsonx.data Pandas dataframes are commonly used in Jupyter notebooks to analyze data. This code will connect to Presto using a Pandas dataframe and display some data from an existing table that was created in Presto. Note that the certificate required for this notebook is provided in the environment. Accessing watsonx.data with Spark This notebook demonstrates how Spark can connect to watsonx.data and manipulate the data. This system has a local, minimally configured Spark engine that will be used to access the Presto database. This engine is sufficient to demonstrate the steps needed to connect to watsonx.data and access the data that resides in the Presto catalogs. Connecting to Db2 This notebook demonstrates connecting to the local Db2 server using Jupyter notebooks. Connecting to PostgreSQL This notebook demonstrates connecting to the local PostgreSQL server using Jupyter notebooks. Connecting to MySQL This notebook demonstrates connecting to the local MySQL server using Jupyter notebooks.","title":"Jupyter Notebook"},{"location":"wxd-jupyter/#jupyter-notebook","text":"The watsonx.data server includes the Jupyter Notebook service which provides an interactive way of exploring the features of the Presto database. The link to the Jupyter Notebook table of contents is provided in your TechZone reservation. When you initially open the link, it will request a password to view the Table of Contents: The default password for the notebook is watsonx.data . Once you enter the password, the Table of Contents will be displayed. There are 11 notebooks provided in the system, and a brief description of each notebook is found below.","title":"Jupyter Notebook"},{"location":"wxd-jupyter/#introduction-to-jupyter-notebooks","text":"If you are not familiar with the use of Jupyter notebooks, this will be a good starting point. To view the notebook, click on the blue arrow found at the bottom of the box. This will open a new tab in your browser with the contents of the notebook. This notebook provides an introduction to what Jupyter Notebooks are and what the common tasks are that you can perform in a notebook.","title":"Introduction to Jupyter Notebooks"},{"location":"wxd-jupyter/#watsonxdata-credentials","text":"This is a key notebook for you to use during your work with the watsonx.data system. 
This notebook provides details on the userids and passwords for the services that are running in the server. There is no need to use a terminal command line to determine what the credentials are! In addition to the userids and passwords, this notebook provides a convenient way of downloading the certificate required to connect to the Presto database. Simply click on the certificate link and it will be downloaded to your local machine.","title":"Watsonx.data Credentials"},{"location":"wxd-jupyter/#presto-magic-commands","text":"Magic commands are special macros found in Jupyter notebooks that simplify many tasks, including the ability to run SQL commands against a database. This notebook provides an introduction to what magic commands are and how you can use the Presto magic commands to connect and query the Presto database.","title":"Presto Magic Commands"},{"location":"wxd-jupyter/#introduction-to-presto-sql","text":"The watsonx.data lab has two ways of running SQL against the Presto database: Presto CLI commands Python/Pandas/Magic commands This notebook contains all the SQL that is run in the Presto SQL section of the lab. Instead of using the presto-cli command, this notebook uses magic commands to simplify the SQL execution. You can choose either method to explore Presto SQL.","title":"Introduction to Presto SQL"},{"location":"wxd-jupyter/#presto-federation","text":"Presto provides the ability to federate queries across different servers. This notebook explores the ability to federate a PostgreSQL table with a table found in Presto. This lab requires some knowledge of the watsonx.data UI, so it is recommended you become familiar with the UI before running this lab.","title":"Presto Federation"},{"location":"wxd-jupyter/#python-with-watsonxdata","text":"Accessing the Presto database in Python requires the use of the prestodb module which implements features of the DBAPI standard. The notebook demonstrates how to connect to the database and retrieve results.","title":"Python with watsonx.data"},{"location":"wxd-jupyter/#pandas-dataframes-with-watsonxdata","text":"Pandas dataframes are commonly used in Jupyter notebooks to analyze data. This code will connect to Presto using a Pandas dataframe and display some data from an existing table that was created in Presto. Note that the certificate required for this notebook is provided in the environment.","title":"Pandas Dataframes with watsonx.data"},{"location":"wxd-jupyter/#accessing-watsonxdata-with-spark","text":"This notebook demonstrates how Spark can connect to watsonx.data and manipulate the data. This system has a local, minimally configured Spark engine that will be used to access the Presto database. 
This engine is sufficient to demonstrate the steps needed to connect to watsonx.data and access the data that resides in the Presto catalogs.","title":"Accessing watsonx.data with Spark"},{"location":"wxd-jupyter/#connecting-to-db2","text":"This notebook demonstrates connecting to the local Db2 server using Jupyter notebooks.","title":"Connecting to Db2"},{"location":"wxd-jupyter/#connecting-to-postgresql","text":"This notebook demonstrates connecting to the local PostgreSQL server using Jupyter notebooks.","title":"Connecting to PostgreSQL"},{"location":"wxd-jupyter/#connecting-to-mysql","text":"This notebook demonstrates connecting to the local MySQL server using Jupyter notebooks.","title":"Connecting to MySQL"},{"location":"wxd-lab-instructions/","text":"Lab Instructions URL Conventions Your TechZone reservation contains a number of URLs for the services provided in the watsonx.data server. The URL will contain the name of the server and the corresponding port number for the service. Throughout the documentation, the server name will be referred to as region.techzone-server.com and port number is referred to as port . Where you see these URLs, replace them with the values found in your reservation. Commands Throughout the labs, any command that needs to be executed will be highlighted in a grey box: cd /root/ibm-lh-dev/bin A copy icon is usually found on the far right-hand side of the command box. Use this to copy the text and paste it into your command window. You can also select the text and copy it that way. Notes : Some commands may span multiple lines, so make sure you copy everything in the box if you are not using the copy button Commands pasted into a terminal window will require that you hit the Return or Enter key for the command to be executed Commands pasted into a Presto CLI window will execute automatically System Check The watsonx.data server automatically starts all services except for Apache Superset and the VNC service. To check the status of the server, run the following commands. Make sure that you have an open terminal session and use the following command to connect to the watsonx.data server. ssh -p port watsonx@region.techzone-server.com Password is watsonx.data . Next switch to the root userid. sudo su - Switch to the development code bin directory. cd /root/ibm-lh-dev/bin Once you have switched to the development directory, you can start running watsonx.data commands. You can check the status with the following command. ./status.sh --all Output will look similar to: using /root/ibm-lh-dev/localstorage/volumes as data root directory for user: root/1001 infra config location is /root/ibm-lh-dev/localstorage/volumes/infra lhconsole-ui running 0.0.0.0:9443->8443/tcp, :::9443->8443/tcp lhconsole-nodeclient-svc running 3001/tcp lhconsole-javaapi-svc running 8090/tcp lhconsole-api running 3333/tcp, 8081/tcp ibm-lh-presto running 0.0.0.0:8443->8443/tcp, :::8443->8443/tcp ibm-lh-hive-metastore running ibm-lh-postgres running 5432/tcp ibm-lh-minio running To confirm that the software is working, run the following commands to validate the installation. Presto Engine Test Check the Presto engine by connecting to a schema. First, we need to make sure that the Presto engine has completed all startup tasks. The following command is not part of watsonx.data, but has been included to simplify checking the status of the Presto service. check_presto Waiting for Presto to start. ........................... 
Ready Note : The starting message may take up to 5 minutes to complete when the system first initializes. Once the command returns \"Ready\" you can connect to the presto CLI. ./presto-cli --catalog tpch --schema tiny Note : If the Presto engine has not yet started (you didn't run the check_presto script), the next command may result in a useless Java error message. You may need to wait a minute before attempting to run the statement again. Retrieve some rows from the customer table. select * from customer limit 10; All Presto commands end with a semi-colon. The result set should include a number of rows (the results will be random). custkey | name | address | nationkey | phone | acctbal | mktsegment | comment ---------+--------------------+---------------------------------------+-----------+-----------------+---------+------------+------------------------------------------------------------------------------------------------------------------- 1 | Customer#000000001 | IVhzIApeRb ot,c,E | 15 | 25-989-741-2988 | 711.56 | BUILDING | to the even, regular platelets. regular, ironic epitaphs nag e 2 | Customer#000000002 | XSTf4,NCwDVaWNe6tEgvwfmRchLXak | 13 | 23-768-687-3665 | 121.65 | AUTOMOBILE | l accounts. blithely ironic theodolites integrate boldly: caref 3 | Customer#000000003 | MG9kdTD2WBHm | 1 | 11-719-748-3364 | 7498.12 | AUTOMOBILE | deposits eat slyly ironic, even instructions. express foxes detect slyly. blithely even accounts abov 4 | Customer#000000004 | XxVSJsLAGtn | 4 | 14-128-190-5944 | 2866.83 | MACHINERY | requests. final, regular ideas sleep final accou 5 | Customer#000000005 | KvpyuHCplrB84WgAiGV6sYpZq7Tj | 3 | 13-750-942-6364 | 794.47 | HOUSEHOLD | n accounts will have to unwind. foxes cajole accor 6 | Customer#000000006 | sKZz0CsnMD7mp4Xd0YrBvx,LREYKUWAh yVn | 20 | 30-114-968-4951 | 7638.57 | AUTOMOBILE | tions. even deposits boost according to the slyly bold packages. final accounts cajole requests. furious 7 | Customer#000000007 | TcGe5gaZNgVePxU5kRrvXBfkasDTea | 18 | 28-190-982-9759 | 9561.95 | AUTOMOBILE | ainst the ironic, express theodolites. express, even pinto beans among the exp 8 | Customer#000000008 | I0B10bB0AymmC, 0PrRYBCP1yGJ8xcBPmWhl5 | 17 | 27-147-574-9335 | 6819.74 | BUILDING | among the slyly regular theodolites kindle blithely courts. carefully even theodolites haggle slyly along the ide 9 | Customer#000000009 | xKiAFTjUsCuxfeleNqefumTrjS | 8 | 18-338-906-3675 | 8324.07 | FURNITURE | r theodolites according to the requests wake thinly excuses: pending requests haggle furiousl 10 | Customer#000000010 | 6LrEaV6KR6PLVcgl2ArL Q3rqzLzcT1 v2 | 5 | 15-741-346-9870 | 2753.54 | HOUSEHOLD | es regular deposits haggle. fur (10 rows) The output on your screen will look similar to the following: The arrows on the far right side indicate that there is more output to view. Press the right and left arrows on your keyboard to scroll the display. If the result set is small, all of the results will display on the screen and no scrolling will be available unless the results are wider than the screen size. When the display shows (END) you have reached the bottom of the output. If the display shows a colon ( : ) at the bottom of the screen, you can use the up and down arrow keys to scroll a record at a time, or the Page Up and Page Down keys to scroll a page at a time. To quit viewing the output, press the Q key. Quit the Presto CLI. The Presto quit command can be used with or without a semicolon.
quit; Congratulations, your system is now up and running!","title":"Lab Instructions"},{"location":"wxd-lab-instructions/#lab-instructions","text":"","title":"Lab Instructions"},{"location":"wxd-lab-instructions/#url-conventions","text":"Your TechZone reservation contains a number of URLs for the services provided in the watsonx.data server. The URL will contain the name of the server and the corresponding port number for the service. Throughout the documentation, the server name will be referred to as region.techzone-server.com and the port number as port . Where you see these URLs, replace them with the values found in your reservation.","title":"URL Conventions"},{"location":"wxd-lab-instructions/#commands","text":"Throughout the labs, any command that needs to be executed will be highlighted in a grey box: cd /root/ibm-lh-dev/bin A copy icon is usually found on the far right-hand side of the command box. Use this to copy the text and paste it into your command window. You can also select the text and copy it that way. Notes : Some commands may span multiple lines, so make sure you copy everything in the box if you are not using the copy button Commands pasted into a terminal window will require that you hit the Return or Enter key for the command to be executed Commands pasted into a Presto CLI window will execute automatically","title":"Commands"},{"location":"wxd-lab-instructions/#system-check","text":"The watsonx.data server automatically starts all services except for Apache Superset and the VNC service. To check the status of the server, run the following commands. Make sure that you have an open terminal session and use the following command to connect to the watsonx.data server. ssh -p port watsonx@region.techzone-server.com Password is watsonx.data . Next switch to the root userid. sudo su - Switch to the development code bin directory. cd /root/ibm-lh-dev/bin Once you have switched to the development directory, you can start running watsonx.data commands. You can check the status with the following command. ./status.sh --all Output will look similar to: using /root/ibm-lh-dev/localstorage/volumes as data root directory for user: root/1001 infra config location is /root/ibm-lh-dev/localstorage/volumes/infra lhconsole-ui running 0.0.0.0:9443->8443/tcp, :::9443->8443/tcp lhconsole-nodeclient-svc running 3001/tcp lhconsole-javaapi-svc running 8090/tcp lhconsole-api running 3333/tcp, 8081/tcp ibm-lh-presto running 0.0.0.0:8443->8443/tcp, :::8443->8443/tcp ibm-lh-hive-metastore running ibm-lh-postgres running 5432/tcp ibm-lh-minio running To confirm that the software is working, run the following commands to validate the installation.","title":"System Check"},{"location":"wxd-lab-instructions/#presto-engine-test","text":"Check the Presto engine by connecting to a schema. First, we need to make sure that the Presto engine has completed all startup tasks. The following command is not part of watsonx.data, but has been included to simplify checking the status of the Presto service. check_presto Waiting for Presto to start. ........................... Ready Note : The starting message may take up to 5 minutes to complete when the system first initializes. Once the command returns \"Ready\" you can connect to the presto CLI. ./presto-cli --catalog tpch --schema tiny Note : If the Presto engine has not yet started (you didn't run the check_presto script), the next command may result in a useless Java error message.
You may need to wait a minute before attempting to run the statement again. Retrieve some rows from the customer table. select * from customer limit 10; All Presto commands end with a semi-colon. The result set should include a number of rows (the results will be random). custkey | name | address | nationkey | phone | acctbal | mktsegment | comment ---------+--------------------+---------------------------------------+-----------+-----------------+---------+------------+------------------------------------------------------------------------------------------------------------------- 1 | Customer#000000001 | IVhzIApeRb ot,c,E | 15 | 25-989-741-2988 | 711.56 | BUILDING | to the even, regular platelets. regular, ironic epitaphs nag e 2 | Customer#000000002 | XSTf4,NCwDVaWNe6tEgvwfmRchLXak | 13 | 23-768-687-3665 | 121.65 | AUTOMOBILE | l accounts. blithely ironic theodolites integrate boldly: caref 3 | Customer#000000003 | MG9kdTD2WBHm | 1 | 11-719-748-3364 | 7498.12 | AUTOMOBILE | deposits eat slyly ironic, even instructions. express foxes detect slyly. blithely even accounts abov 4 | Customer#000000004 | XxVSJsLAGtn | 4 | 14-128-190-5944 | 2866.83 | MACHINERY | requests. final, regular ideas sleep final accou 5 | Customer#000000005 | KvpyuHCplrB84WgAiGV6sYpZq7Tj | 3 | 13-750-942-6364 | 794.47 | HOUSEHOLD | n accounts will have to unwind. foxes cajole accor 6 | Customer#000000006 | sKZz0CsnMD7mp4Xd0YrBvx,LREYKUWAh yVn | 20 | 30-114-968-4951 | 7638.57 | AUTOMOBILE | tions. even deposits boost according to the slyly bold packages. final accounts cajole requests. furious 7 | Customer#000000007 | TcGe5gaZNgVePxU5kRrvXBfkasDTea | 18 | 28-190-982-9759 | 9561.95 | AUTOMOBILE | ainst the ironic, express theodolites. express, even pinto beans among the exp 8 | Customer#000000008 | I0B10bB0AymmC, 0PrRYBCP1yGJ8xcBPmWhl5 | 17 | 27-147-574-9335 | 6819.74 | BUILDING | among the slyly regular theodolites kindle blithely courts. carefully even theodolites haggle slyly along the ide 9 | Customer#000000009 | xKiAFTjUsCuxfeleNqefumTrjS | 8 | 18-338-906-3675 | 8324.07 | FURNITURE | r theodolites according to the requests wake thinly excuses: pending requests haggle furiousl 10 | Customer#000000010 | 6LrEaV6KR6PLVcgl2ArL Q3rqzLzcT1 v2 | 5 | 15-741-346-9870 | 2753.54 | HOUSEHOLD | es regular deposits haggle. fur (10 rows) The output on your screen will look similar to the following: The arrows on the far right side indicate that there is more output to view. Press the right and left arrows on your keyboard to scroll the display. If the result set is small, all of the results will display on the screen and no scrolling will be available unless the results are wider than the screen size. When the display shows (END) you have reached the bottom of the output. If the display shows a colon ( : ) at the bottom of the screen, you can use the up and down arrow keys to scroll a record at a time, or the Page Up and Page Down keys to scroll a page at a time. To quit viewing the output, press the Q key. Quit the Presto CLI. The Presto quit command can be used with or without a semicolon. quit;","title":"Presto Engine Test"},{"location":"wxd-minio/","text":"Using the MinIO console UI MinIO is a high-performance, S3 compatible object store. Rather than connect to an external S3 object store, we are going to use MinIO locally to run with watsonx.data. To connect to MinIO, you will need to extract the MinIO credentials by querying the docker container.
You must be the root user to issue these commands. export LH_S3_ACCESS_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_ACCESS_KEY | sed 's/.*=//') export LH_S3_SECRET_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_SECRET_KEY | sed 's/.*=//') echo \"MinIO Userid : \" $LH_S3_ACCESS_KEY echo \"MinIO Password: \" $LH_S3_SECRET_KEY MinIO Userid : c4643026087cc21989eb5c12 MinIO Password: 93da45c5af87abd86c9dbc83 You can get all passwords for the system when you are logged in as the watsonx user by using the following command. cat /certs/passwords Your TechZone reservation will include the server name and port number to use when connecting to MinIO. The default port number is 9001, while the server will be referred to as region.techzone-server.com . Replace these values with those found in your reservation. Open your browser and navigate to: Minio console - http://region.techzone-server.com:port Note : Firefox on OSX occasionally freezes when connecting to the MinIO console. The Safari browser is much more reliable. Login with object store credentials found above (These will be different for your system). You should see current buckets in MinIO. We are going to examine these buckets after we populate them with some data. Creating Schemas and Tables Not all catalogs support creation of schemas - as an example, the TPCH catalog is not writeable. We will use the iceberg_data catalog for this exercise. We will need to get some details before we continue. Make sure you are connected as the root user and are in the proper directory. cd /root/ibm-lh-dev/bin Login to the Presto CLI. ./presto-cli --catalog iceberg_data Create schema workshop in catalog iceberg_data . Note how we are using the iceberg-bucket bucket which you should have seen in the MinIO object browser. CREATE SCHEMA IF NOT EXISTS workshop with (location='s3a://iceberg-bucket/'); Show the schemas available. show schemas; Schema ---------- workshop (1 row) Use the workshop schema. use workshop; Creating tables Create a new Apache Iceberg table using existing data in the sample Customer table as part of the TPCH catalog schema called TINY. create table customer as select * from tpch.tiny.customer; Show the tables. show tables; Table ---------- customer (1 row) Quit Presto. quit; \u2003 Refresh the Minio screen (see button on the far-right side). You should now see new objects under iceberg-bucket Click on the bucket name and you will see the customer table. Selecting the customer object will show that there is data and metadata in there. How do we know that this data is based on Apache Iceberg? If you open the file under metadata , you should see metadata information for the data we are storing in parquet file format. Do I really need Apache Iceberg? YES, YOU DO! However, it is good to understand why. Metadata is also stored in the Parquet file format but only for the single parquet file. If we add more data/partitions, the data is split into multiple Parquet files, and we don\u2019t have a mechanism to get the table-to-parquet-file mapping. Run the following example to understand this better. You need to get the access keys for MinIO before running the following lab. Make sure you are still connected as root .
export LH_S3_ACCESS_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_ACCESS_KEY | sed 's/.*=//') export LH_S3_SECRET_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_SECRET_KEY | sed 's/.*=//') Open the developer sandbox to connect to MinIO, download the selected parquet file and inspect the parquet file contents. ./dev-sandbox.sh Update the Python files to be executable (makes our commands more convenient). chmod +x /scripts/*.py List all files in the object store (MinIO). /scripts/s3-inspect.py --host ibm-lh-minio-svc:9000 --accessKey $LH_S3_ACCESS_KEY --secretKey $LH_S3_SECRET_KEY --bucket iceberg-bucket iceberg-bucket b'customer/data/e9536a5e-14a1-4823-98ed-cc22d6fc38db.parquet' 2023-06-06 14:31:47.778000+00:00 6737d7268fcb3eb459b675f27f716f48 75373 None iceberg-bucket b'customer/metadata/00000-e26c56e0-c4d7-4625-8b06-422429f6ba8d.metadata.json' 2023-06-06 14:31:48.629000+00:00 2e722c7dd83c1dd260a7e6c9503c0e04 3272 None iceberg-bucket b'customer/metadata/7cb074a4-3da7-4184-9db8-567383bb588a-m0.avro' 2023-06-06 14:31:48.401000+00:00 655a5568207cc399b8297f1488ef77e7 6342 None iceberg-bucket b'customer/metadata/snap-6143645832277262458-1-7cb074a4-3da7-4184-9db8-567383bb588a.avro' 2023-06-06 14:31:48.445000+00:00 0c3714299d43ae86a46eabdcaac1351e 3753 None You can extract the string with the following command. PARQUET=$(/scripts/s3-inspect.py --host ibm-lh-minio-svc:9000 --accessKey $LH_S3_ACCESS_KEY --secretKey $LH_S3_SECRET_KEY --bucket iceberg-bucket | grep -o -m 1 \".*'customer.*parquet\" | sed -n \"s/.*b'//p\") The file name that is retrieved is substituted into the next command. Note: The file name found in $PARQUET will be different on your system. /scripts/s3-download.py --host ibm-lh-minio-svc:9000 --accessKey $LH_S3_ACCESS_KEY --secretKey $LH_S3_SECRET_KEY --bucket iceberg-bucket --srcFile $PARQUET --destFile /tmp/x.parquet \u2003 Describe the File Contents. /scripts/describe-parquet.py /tmp/x.parquet ---------------------- metadata: created_by: num_columns: 8 num_rows: 1500 num_row_groups: 1 format_version: 1.0 serialized_size: 851 ---------------------- ---------------------- schema: custkey: int64 name: binary address: binary nationkey: int64 phone: binary acctbal: double mktsegment: binary comment: binary ---------------------- ---------------------- row group 0: num_columns: 8 num_rows: 1500 total_byte_size: 74555 ---------------------- ---------------------- row group 0, column 1: file_offset: 0 file_path: physical_type: BYTE_ARRAY num_values: 1500 path_in_schema: name is_stats_set: True statistics: has_min_max: False min: None max: None null_count: 0 distinct_count: 0 num_values: 1500 physical_type: BYTE_ARRAY logical_type: None converted_type (legacy): NONE compression: GZIP encodings: ('DELTA_BYTE_ARRAY',) has_dictionary_page: False dictionary_page_offset: None data_page_offset: 112 total_compressed_size: 599 total_uncompressed_size: 2806 ---------------------- Note : In this instance, because we created the table with a select * from customer and no partitioning defined, there was only 1 parquet file and only 1 row group. This is not the norm, and we deliberately did this to show you the value of the Apache Iceberg file format, which can be used by multiple runtimes to access Iceberg data stored in parquet format and managed by the Hive metastore.
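For readers curious how a utility like describe-parquet.py can produce the output above, here is a minimal sketch using the pyarrow library. It assumes the parquet file was downloaded to /tmp/x.parquet as in the previous step; it is an illustration, not the actual lab script.

# Minimal sketch: inspect parquet metadata with pyarrow (not the lab's script)
import pyarrow.parquet as pq

parquet_file = pq.ParquetFile("/tmp/x.parquet")
meta = parquet_file.metadata

# Top-level metadata, matching the first block of output above
print("num_columns:   ", meta.num_columns)
print("num_rows:      ", meta.num_rows)
print("num_row_groups:", meta.num_row_groups)

# Column names and physical types
print(parquet_file.schema)

# Per-row-group detail
for i in range(meta.num_row_groups):
    row_group = meta.row_group(i)
    print(f"row group {i}: rows={row_group.num_rows}, bytes={row_group.total_byte_size}")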
Exit from the Sandbox. exit MinIO CLI The MinIO Client mc command line tool provides an alternative to UNIX commands like ls , cat , cp , mirror , and diff with support for both file systems and Amazon S3-compatible cloud storage services. The mc command-line tool is built for compatibility with the AWS S3 API and is tested with MinIO and AWS S3 for expected functionality and behavior. Complete details and restrictions around the use of the CLI command can be found on the MinIO Client page. You can use the MinIO CLI from a variety of clients. The MinIO ports are open in the developer edition image, which makes it possible to load data directly from your workstation rather than using the MinIO UI. Minio System Alias Before running commands against the MinIO server, an alias must be created that includes the access and secret key. The values can be extracted from the system by listing the contents of the /certs/passwords file or by running the passwords command as the root user. cat /certs/passwords The values for the MinIO access and secret key can also be exported with the following code: export LH_S3_ACCESS_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_ACCESS_KEY | sed 's/.*=//') export LH_S3_SECRET_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_SECRET_KEY | sed 's/.*=//') echo \"MinIO Userid : \" $LH_S3_ACCESS_KEY echo \"MinIO Password: \" $LH_S3_SECRET_KEY The alias command has the following syntax: mc alias set alias-name hostname:port access_key secret_key For a local connection, we will use the following values: Alias Name - watsonxdata Hostname \u2013 watsonxdata Port \u2013 9000 Access Key \u2013 $LH_S3_ACCESS_KEY Secret Key - $LH_S3_SECRET_KEY If you are using an external client to connect to the MinIO service, you will need the URL and Port number from the TechZone reservation. The access key and secret key will be the same values that are found above. Hostname \u2013 region.techzone-server.com Port \u2013 12345 The alias for local access is found below. mc alias set watsonxdata http://watsonxdata:9000 $LH_S3_ACCESS_KEY $LH_S3_SECRET_KEY Added `watsonxdata` successfully. List Buckets The mc command provides us with a number of commands that allow us to manage buckets and files within them. The following command checks to see what buckets currently exist in the system. mc ls tree watsonxdata [2023-09-29 14:38:19 EDT] 0B hive-bucket/ [2023-09-29 14:38:19 EDT] 0B iceberg-bucket/ You can view the contents of a bucket by traversing down the path. mc ls tree watsonxdata/hive-bucket [2023-10-13 10:34:36 EDT] 0B gosalesdw/ [2023-10-13 10:34:36 EDT] 0B hive_sql/ [2023-10-13 10:34:36 EDT] 0B ontime/ [2023-10-13 10:34:36 EDT] 0B taxi/ Create a Bucket At this point we will create a new bucket to hold some data. Use the mb (make bucket) command. The command requires the alias name for the MinIO connection followed by the name of the bucket. mc mb alias-name/new-bucket The following code will create a new bucket in the system called sampledata . mc mb watsonxdata/sampledata Bucket created successfully `watsonxdata/sampledata`. We can double-check that the bucket is there. mc ls tree watsonxdata [2023-09-29 14:38:19 EDT] 0B hive-bucket/ [2023-09-29 14:38:19 EDT] 0B iceberg-bucket/ [2023-10-13 10:39:47 EDT] 0B sampledata/ Loading Data One of the most powerful features of the MinIO CLI is its ability to load data directly from your workstation into the bucket, rather than having to use the MinIO UI. It is also significantly faster than using the UI.
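As an aside, the same bucket operations can be scripted in Python with the minio client library rather than the mc CLI. The sketch below mirrors the mc examples in this section; it assumes it is run on the watsonx.data server with the LH_S3_ACCESS_KEY and LH_S3_SECRET_KEY variables exported as shown earlier, and it is an illustration rather than part of the lab.

# Minimal sketch: bucket operations with the Python minio client library
import os
from minio import Minio

client = Minio(
    "watsonxdata:9000",                       # same endpoint as the mc alias
    access_key=os.environ["LH_S3_ACCESS_KEY"],
    secret_key=os.environ["LH_S3_SECRET_KEY"],
    secure=False,                             # the local MinIO endpoint is http
)

# List buckets (equivalent to: mc ls tree watsonxdata)
for bucket in client.list_buckets():
    print(bucket.name)

# Create a bucket and upload one file (equivalent to mc mb / mc cp)
if not client.bucket_exists("sampledata"):
    client.make_bucket("sampledata")
client.fput_object("sampledata", "taxi/taxi.csv", "/sampledata/csv/taxi/taxi.csv")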
The next example will load data into the bucket that was just created. The directory that we will be using to load data from is called /sampledata and is found in the root directory of the watsonx.data server. ls /sampledata/csv gosales ontime taxi Next we will load the data from each one of these directories into the sampledata bucket. The mc command allows you to select which files to place into a bucket, or an entire directory with recursion. In this case we are loading all three directories of files into the bucket. Note the use of the / at the end of the directory name to prevent the directory name csv from being used as the high-level directory name in the target bucket. mc cp --recursive /sampledata/csv/ watsonxdata/sampledata/ ...data/csv/taxi/taxi.csv: 306.16 MiB / 306.16 MiB \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501 147.91 MiB/s 2s We can double-check that our files are there with the --files option: mc tree --files watsonxdata/sampledata/ watsonxdata/sampledata/ \u251c\u2500 gosales \u2502 \u251c\u2500 DIST_INVENTORY_FACT.csv \u2502 \u251c\u2500 DIST_PRODUCT_FORECAST_FACT.csv \u2502 \u251c\u2500 DIST_RETURNED_ITEMS_FACT.csv \u2502 \u251c\u2500 DIST_RETURN_REASON_DIM.csv .... \u2502 \u251c\u2500 EMP_EMPLOYEE_DIM.csv \u2502 \u251c\u2500 SLS_SALES_TARG_FACT.csv \u2502 \u251c\u2500 gosales_createtable.sql \u2502 \u2514\u2500 gosales_load_postgres.sql \u251c\u2500 ontime \u2502 \u251c\u2500 aircraft.csv \u2502 \u251c\u2500 airline_id.csv \u2502 \u251c\u2500 airport_id.csv \u2502 \u251c\u2500 cancellation.csv \u2502 \u2514\u2500 ontime.csv \u2514\u2500 taxi \u2514\u2500 taxi.csv Delete a File or Bucket Use the rb (Remove bucket) command to remove a bucket and its contents. You can remove individual objects by using the rm (Remove) command by fully qualifying the object. The next command will remove the ontime.csv file from the ontime folder. mc rm watsonxdata/sampledata/ontime/ontime.csv Removed `watsonxdata/sampledata/ontime/ontime.csv`. The delete bucket command will fail if you still have data in the bucket. mc rb watsonxdata/sampledata mc: `watsonxdata/sampledata` is not empty. Retry this command with \u2018--force\u2019 flag if you want to remove `watsonxdata/sampledata` and all its contents Adding the --force option will remove the bucket and all the data in it. Use with caution! mc rb --force watsonxdata/sampledata Removed `watsonxdata/sampledata` successfully.","title":"MinIO UI"},{"location":"wxd-minio/#using-the-minio-console-ui","text":"MinIO is a high-performance, S3 compatible object store. Rather than connect to an external S3 object store, we are going to use MinIO locally to run with watsonx.data. To connect to MinIO, you will need to extract the MinIO credentials by querying the docker container. You must be the root user to issue these commands. export LH_S3_ACCESS_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_ACCESS_KEY | sed 's/.*=//') export LH_S3_SECRET_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_SECRET_KEY | sed 's/.*=//') echo \"MinIO Userid : \" $LH_S3_ACCESS_KEY echo \"MinIO Password: \" $LH_S3_SECRET_KEY MinIO Userid : c4643026087cc21989eb5c12 MinIO Password: 93da45c5af87abd86c9dbc83 You can get all passwords for the system when you are logged in as the watsonx user by using the following command. cat /certs/passwords Your TechZone reservation will include the server name and port number to use when connecting to MinIO.
The default port number is 9001, while the server will be referred to as region.techzone-server.com . Replace these values with those found in your reservation. Open your browser and navigate to: Minio console - http://region.techzone-server.com:port Note : Firefox on OSX occasionally freezes when connecting to the MinIO console. The Safari browser is much more reliable. Login with object store credentials found above (These will be different for your system). You should see current buckets in MinIO. We are going to examine these buckets after we populate them with some data.","title":"Using the MinIO console UI"},{"location":"wxd-minio/#creating-schemas-and-tables","text":"Not all catalogs support creation of schemas - as an example, the TPCH catalog is not writeable. We will use the iceberg_data catalog for this exercise. We will need to get some details before we continue. Make sure you are connected as the root user and are in the proper directory. cd /root/ibm-lh-dev/bin Login to the Presto CLI. ./presto-cli --catalog iceberg_data Create schema workshop in catalog iceberg_data . Note how we are using the iceberg-bucket bucket which you should have seen in the MinIO object browser. CREATE SCHEMA IF NOT EXISTS workshop with (location='s3a://iceberg-bucket/'); Show the schemas available. show schemas; Schema ---------- workshop (1 row) Use the workshop schema. use workshop;","title":"Creating Schemas and Tables"},{"location":"wxd-minio/#creating-tables","text":"Create a new Apache Iceberg table using existing data in the sample Customer table as part of the TPCH catalog schema called TINY. create table customer as select * from tpch.tiny.customer; Show the tables. show tables; Table ---------- customer (1 row) Quit Presto. quit; \u2003 Refresh the Minio screen (see button on the far-right side). You should now see new objects under iceberg-bucket Click on the bucket name and you will see the customer table. Selecting the customer object will show that there is data and metadata in there. How do we know that this data is based on Apache Iceberg? If you open the file under metadata , you should see metadata information for the data we are storing in parquet file format.","title":"Creating tables"},{"location":"wxd-minio/#do-i-really-need-apache-iceberg","text":"YES, YOU DO! However, it is good to understand why. Metadata is also stored in the Parquet file format but only for the single parquet file. If we add more data/partitions, the data is split into multiple Parquet files, and we don\u2019t have a mechanism to get the table-to-parquet-file mapping. Run the following example to understand this better. You need to get the access keys for MinIO before running the following lab. Make sure you are still connected as root . export LH_S3_ACCESS_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_ACCESS_KEY | sed 's/.*=//') export LH_S3_SECRET_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_SECRET_KEY | sed 's/.*=//') Open the developer sandbox to connect to MinIO, download the selected parquet file and inspect the parquet file contents. ./dev-sandbox.sh Update the Python files to be executable (makes our commands more convenient). chmod +x /scripts/*.py List all files in the object store (MinIO).
/scripts/s3-inspect.py --host ibm-lh-minio-svc:9000 --accessKey $LH_S3_ACCESS_KEY --secretKey $LH_S3_SECRET_KEY --bucket iceberg-bucket iceberg-bucket b'customer/data/e9536a5e-14a1-4823-98ed-cc22d6fc38db.parquet' 2023-06-06 14:31:47.778000+00:00 6737d7268fcb3eb459b675f27f716f48 75373 None iceberg-bucket b'customer/metadata/00000-e26c56e0-c4d7-4625-8b06-422429f6ba8d.metadata.json' 2023-06-06 14:31:48.629000+00:00 2e722c7dd83c1dd260a7e6c9503c0e04 3272 None iceberg-bucket b'customer/metadata/7cb074a4-3da7-4184-9db8-567383bb588a-m0.avro' 2023-06-06 14:31:48.401000+00:00 655a5568207cc399b8297f1488ef77e7 6342 None iceberg-bucket b'customer/metadata/snap-6143645832277262458-1-7cb074a4-3da7-4184-9db8-567383bb588a.avro' 2023-06-06 14:31:48.445000+00:00 0c3714299d43ae86a46eabdcaac1351e 3753 None You can extract the string with the following command. PARQUET=$(/scripts/s3-inspect.py --host ibm-lh-minio-svc:9000 --accessKey $LH_S3_ACCESS_KEY --secretKey $LH_S3_SECRET_KEY --bucket iceberg-bucket | grep -o -m 1 \".*'customer.*parquet\" | sed -n \"s/.*b'//p\") The file name that is retrieved is substituted into the next command. Note: The file name found in $PARQUET will be different on your system. /scripts/s3-download.py --host ibm-lh-minio-svc:9000 --accessKey $LH_S3_ACCESS_KEY --secretKey $LH_S3_SECRET_KEY --bucket iceberg-bucket --srcFile $PARQUET --destFile /tmp/x.parquet \u2003 Describe the File Contents. /scripts/describe-parquet.py /tmp/x.parquet ---------------------- metadata: created_by: num_columns: 8 num_rows: 1500 num_row_groups: 1 format_version: 1.0 serialized_size: 851 ---------------------- ---------------------- schema: custkey: int64 name: binary address: binary nationkey: int64 phone: binary acctbal: double mktsegment: binary comment: binary ---------------------- ---------------------- row group 0: num_columns: 8 num_rows: 1500 total_byte_size: 74555 ---------------------- ---------------------- row group 0, column 1: file_offset: 0 file_path: physical_type: BYTE_ARRAY num_values: 1500 path_in_schema: name is_stats_set: True statistics: has_min_max: False min: None max: None null_count: 0 distinct_count: 0 num_values: 1500 physical_type: BYTE_ARRAY logical_type: None converted_type (legacy): NONE compression: GZIP encodings: ('DELTA_BYTE_ARRAY',) has_dictionary_page: False dictionary_page_offset: None data_page_offset: 112 total_compressed_size: 599 total_uncompressed_size: 2806 ---------------------- Note : In this instance, because we created the table with a select * from customer and no partitioning defined, there was only 1 parquet file and only 1 row group. This is not the norm, and we deliberately did this to show you the value of the Apache Iceberg file format, which can be used by multiple runtimes to access Iceberg data stored in parquet format and managed by the Hive metastore. Exit from the Sandbox. exit","title":"Do I really need Apache Iceberg?"},{"location":"wxd-minio/#minio-cli","text":"The MinIO Client mc command line tool provides an alternative to UNIX commands like ls , cat , cp , mirror , and diff with support for both file systems and Amazon S3-compatible cloud storage services. The mc command-line tool is built for compatibility with the AWS S3 API and is tested with MinIO and AWS S3 for expected functionality and behavior. Complete details and restrictions around the use of the CLI command can be found on the MinIO Client page. You can use the MinIO CLI from a variety of clients.
The MinIO ports are open in the developer edition image, which makes it possible to load data directly from your workstation rather than using the MinIO UI.","title":"MinIO CLI"},{"location":"wxd-minio/#minio-system-alias","text":"Before running commands against the MinIO server, an alias must be created that includes the access and secret key. The values can be extracted from the system by listing the contents of the /certs/passwords file or by running the passwords command as the root user. cat /certs/passwords The values for the MinIO access and secret key can also be exported with the following code: export LH_S3_ACCESS_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_ACCESS_KEY | sed 's/.*=//') export LH_S3_SECRET_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_SECRET_KEY | sed 's/.*=//') echo \"MinIO Userid : \" $LH_S3_ACCESS_KEY echo \"MinIO Password: \" $LH_S3_SECRET_KEY The alias command has the following syntax: mc alias set alias-name hostname:port access_key secret_key For a local connection, we will use the following values: Alias Name - watsonxdata Hostname \u2013 watsonxdata Port \u2013 9000 Access Key \u2013 $LH_S3_ACCESS_KEY Secret Key - $LH_S3_SECRET_KEY If you are using an external client to connect to the MinIO service, you will need the URL and Port number from the TechZone reservation. The access key and secret key will be the same values that are found above. Hostname \u2013 region.techzone-server.com Port \u2013 12345 The alias for local access is found below. mc alias set watsonxdata http://watsonxdata:9000 $LH_S3_ACCESS_KEY $LH_S3_SECRET_KEY Added `watsonxdata` successfully.","title":"Minio System Alias"},{"location":"wxd-minio/#list-buckets","text":"The mc command provides us with a number of commands that allow us to manage buckets and files within them. The following command checks to see what buckets currently exist in the system. mc ls tree watsonxdata [2023-09-29 14:38:19 EDT] 0B hive-bucket/ [2023-09-29 14:38:19 EDT] 0B iceberg-bucket/ You can view the contents of a bucket by traversing down the path. mc ls tree watsonxdata/hive-bucket [2023-10-13 10:34:36 EDT] 0B gosalesdw/ [2023-10-13 10:34:36 EDT] 0B hive_sql/ [2023-10-13 10:34:36 EDT] 0B ontime/ [2023-10-13 10:34:36 EDT] 0B taxi/","title":"List Buckets"},{"location":"wxd-minio/#create-a-bucket","text":"At this point we will create a new bucket to hold some data. Use the mb (make bucket) command. The command requires the alias name for the MinIO connection followed by the name of the bucket. mc mb alias-name/new-bucket The following code will create a new bucket in the system called sampledata . mc mb watsonxdata/sampledata Bucket created successfully `watsonxdata/sampledata`. We can double-check that the bucket is there. mc ls tree watsonxdata [2023-09-29 14:38:19 EDT] 0B hive-bucket/ [2023-09-29 14:38:19 EDT] 0B iceberg-bucket/ [2023-10-13 10:39:47 EDT] 0B sampledata/","title":"Create a Bucket"},{"location":"wxd-minio/#loading-data","text":"One of the most powerful features of the MinIO CLI is its ability to load data directly from your workstation into the bucket, rather than having to use the MinIO UI. It is also significantly faster than using the UI. The next example will load data into the bucket that was just created. The directory that we will be using to load data from is called /sampledata and is found in the root directory of the watsonx.data server.
ls /sampledata/csv gosales ontime taxi Next we will load the data from each one of these directories into the sampledata bucket. The mc command allows you to select which files to place into a bucket, or an entire directory with recursion. In this case we are loading all three directories of files into the bucket. Note the use of the / at the end of the directory name to prevent the directory name csv from being used as the high-level directory name in the target bucket. mc cp --recursive /sampledata/csv/ watsonxdata/sampledata/ ...data/csv/taxi/taxi.csv: 306.16 MiB / 306.16 MiB \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501 147.91 MiB/s 2s We can double-check that our files are there with the --files option: mc tree --files watsonxdata/sampledata/ watsonxdata/sampledata/ \u251c\u2500 gosales \u2502 \u251c\u2500 DIST_INVENTORY_FACT.csv \u2502 \u251c\u2500 DIST_PRODUCT_FORECAST_FACT.csv \u2502 \u251c\u2500 DIST_RETURNED_ITEMS_FACT.csv \u2502 \u251c\u2500 DIST_RETURN_REASON_DIM.csv .... \u2502 \u251c\u2500 EMP_EMPLOYEE_DIM.csv \u2502 \u251c\u2500 SLS_SALES_TARG_FACT.csv \u2502 \u251c\u2500 gosales_createtable.sql \u2502 \u2514\u2500 gosales_load_postgres.sql \u251c\u2500 ontime \u2502 \u251c\u2500 aircraft.csv \u2502 \u251c\u2500 airline_id.csv \u2502 \u251c\u2500 airport_id.csv \u2502 \u251c\u2500 cancellation.csv \u2502 \u2514\u2500 ontime.csv \u2514\u2500 taxi \u2514\u2500 taxi.csv","title":"Loading Data"},{"location":"wxd-minio/#delete-a-file-or-bucket","text":"Use the rb (Remove bucket) command to remove a bucket and its contents. You can remove individual objects by using the rm (Remove) command by fully qualifying the object. The next command will remove the ontime.csv file from the ontime folder. mc rm watsonxdata/sampledata/ontime/ontime.csv Removed `watsonxdata/sampledata/ontime/ontime.csv`. The delete bucket command will fail if you still have data in the bucket. mc rb watsonxdata/sampledata mc: `watsonxdata/sampledata` is not empty. Retry this command with \u2018--force\u2019 flag if you want to remove `watsonxdata/sampledata` and all its contents Adding the --force option will remove the bucket and all the data in it. Use with caution! mc rb --force watsonxdata/sampledata Removed `watsonxdata/sampledata` successfully.","title":"Delete a File or Bucket"},{"location":"wxd-objectstore/","text":"Working with Object Store Buckets In this lab, we will run through some exercises to understand how watsonx.data can be configured to work with multiple buckets, using IBM COS, in addition to the out-of-the-box MinIO bucket. In the GA version, there will be a user experience to facilitate such setup; however, this lab will help you understand some service-to-service interactions & configurations. Why do we need to do this? In this lab, we will use multiple buckets as this is also how we can illustrate compute-storage separation. Out of the box, both in SaaS and Software, a tiny Object Store bucket is allocated, primarily for getting started use cases. Customers would need to point to their own bucket for their data. The use of a remote bucket (in this example, MinIO) also showcases the \"open\" aspect of the watsonx.data system. Customers own their data and can physically access the iceberg-ed bucket using other applications or engines, even custom ones that they build themselves. Customers would also have requirements to place (data sovereignty) buckets in specific locations.
Compute/analytics engines may need to run in different locations, say closer to applications and connect to buckets in other networks/geos. There will also be situations where the same engine federates data across multiple buckets (and other database connections). As part of the GA release, there will also be authorization & data access rules that will control which user/group can access buckets even within the same engine. In Enterprise/Production environments, engines are expected to be ephemeral or there can be multiple engines. These engines when they come up will connect to different object store buckets. Apart from Presto, the list of engines will include Db2, NZ, and IBM Analytics Engine for Spark. The shared meta-store is critical in all of this as it helps provide relevant schema information to the engines. Create new bucket in MinIO Open your browser and navigate to the MinIO console. Check to see if the MinIO credentials exist in your terminal session. printf \"\\nAccess Key: $LH_S3_ACCESS_KEY \\nSecret Key: $LH_S3_SECRET_KEY\\n\" Access Key: fcf1ec270e05a5031ca27bc9 Secret Key: a671febd9e1e3826cf8cdcf5 If these values are blank, you need to run the following command. export LH_S3_ACCESS_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_ACCESS_KEY | sed 's/.*=//') export LH_S3_SECRET_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_SECRET_KEY | sed 's/.*=//') Click on the Buckets tab to show the current buckets in the MinIO system. You can see that we have two buckets used for the labs. We need to create a new bucket to use for our schema. Press the \"Create Bucket +\" option on the right side of the screen. Note : The size and contents of the existing buckets will be different on your system. Enter a bucket name (customer) and then press Create Bucket. You should now see your new bucket below. Open your browser and connect to the watsonx.data UI: Navigate to the Infrastructure manager by clicking on the icon below the Home symbol. Get the S3 bucket credentials. printf \"\\nAccess Key: $LH_S3_ACCESS_KEY \\nSecret Key: $LH_S3_SECRET_KEY\\n\" Click on the Add component menu and select Add bucket. Fill in the dialog with the following values. Bucket type \u2013 MinIO Bucket name \u2013 customer Display name \u2013 customer Endpoint \u2013 http://ibm-lh-minio-svc:9000 Access key \u2013 $LH_S3_ACCESS_KEY (contents of this value) Secret key \u2013 $LH_S3_SECRET_KEY (contents of this value) Activate now \u2013 Yes Catalog type - Apache Iceberg Catalog name - customer When done press Add and Activate now. Your UI should change to display the new bucket (Your screen may be slightly different). Note : This step may take a minute to complete. At this point you need to Associate the bucket with the Presto engine. When you hover your mouse over the Customer catalog, the Associate icon will display. If you do not see the Associate icon, refresh the browser page. Press the Associate button and the following dialog will display. Select the presto-01 engine, press the Save and restart engine button, and wait for the screen to refresh. Note : Your display will be different. Exploring the Customer bucket First check to make sure that the Presto engine has finished starting. While the watsonx.data UI has restarted the Presto process, it takes a few seconds to become available. check_presto Switch to the bin directory as the root user. sudo su - cd /root/ibm-lh-dev/bin Connect to Presto using the new customer catalog.
./presto-cli --catalog customer We will create a schema where we store our table data using the new catalog name we created for the customer bucket. CREATE SCHEMA IF NOT EXISTS newworkshop with (location='s3a://customer/'); Switch to the new schema. use newworkshop; Use the following SQL to create a new table in the customer bucket. create table customer as select * from tpch.tiny.customer; CREATE TABLE: 1500 rows Quit Presto. quit; You can use the Developer sandbox (bin/dev-sandbox.sh), as described in MinIO UI , to inspect the Customer bucket with the s3-inspect utility. However, it is easier to use the MinIO console to view the bucket. Open your browser and navigate to the MinIO console. From the main screen select Object Browser and view the contents of the customer bucket. Note : You can continue to add new buckets when working with the watsonx.data UI. However, if you delete the catalog or bucket in the UI, you may not be able to re-catalog it. If you find that this happens, create another bucket, or rename the original one if that is possible.","title":"Working with Object Store Buckets"},{"location":"wxd-objectstore/#working-with-object-store-buckets","text":"In this lab, we will run through some exercises to understand how watsonx.data can be configured to work with multiple buckets, using IBM COS, in addition to the out-of-the-box MinIO bucket. In the GA version, there will be a user experience to facilitate such setup; however, this lab will help you understand some service-to-service interactions & configurations.","title":"Working with Object Store Buckets"},{"location":"wxd-objectstore/#why-do-we-need-to-do-this","text":"In this lab, we will use multiple buckets as this is also how we can illustrate compute-storage separation. Out of the box, both in SaaS and Software, a tiny Object Store bucket is allocated, primarily for getting started use cases. Customers would need to point to their own bucket for their data. The use of a remote bucket (in this example, MinIO) also showcases the \"open\" aspect of the watsonx.data system. Customers own their data and can physically access the iceberg-ed bucket using other applications or engines, even custom ones that they build themselves. Customers would also have requirements to place (data sovereignty) buckets in specific locations. Compute/analytics engines may need to run in different locations, say closer to applications and connect to buckets in other networks/geos. There will also be situations where the same engine federates data across multiple buckets (and other database connections). As part of the GA release, there will also be authorization & data access rules that will control which user/group can access buckets even within the same engine. In Enterprise/Production environments, engines are expected to be ephemeral or there can be multiple engines. These engines when they come up will connect to different object store buckets. Apart from Presto, the list of engines will include Db2, NZ, and IBM Analytics Engine for Spark. The shared meta-store is critical in all of this as it helps provide relevant schema information to the engines.","title":"Why do we need to do this?"},{"location":"wxd-objectstore/#create-new-bucket-in-minio","text":"Open your browser and navigate to the MinIO console. Check to see if the MinIO credentials exist in your terminal session.
printf \"\\nAccess Key: $LH_S3_ACCESS_KEY \\nSecret Key: $LH_S3_SECRET_KEY\\n\" Userid : fcf1ec270e05a5031ca27bc9 Password: a671febd9e1e3826cf8cdcf5 If these values are blank, you need to run the following command. export LH_S3_ACCESS_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_ACCESS_KEY | sed 's/.*=//') export LH_S3_SECRET_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_SECRET_KEY | sed 's/.*=//') Click on the Buckets tab to show the current buckets in the MinIO system. You can see that we have two buckets used for the labs. We need to create a new bucket to use for our schema. Press the \"Create Bucket +\" option on the right side of the screen. Note : The size and contents of the existing buckets will be different on your system. Enter a bucket name (customer) and then press Create Bucket. You should now see your new bucket below. Open your browser and connect to the watsonx.data UI: Navigate to the Infrastructure manager by clicking on the icon below the Home symbol. Get the S3 bucket credentials. printf \"\\nAccess Key: $LH_S3_ACCESS_KEY \\nSecret Key: $LH_S3_SECRET_KEY\\n\" Click on the Add component menu and select Add bucket. Fill in the dialog with the following values. Bucket type \u2013 MinIO Bucket name \u2013 customer Display name \u2013 customer Endpoint \u2013 http://ibm-lh-minio-svc:9000 Access key \u2013 $LH_S3_ACCESS_KEY (contents of this value) Secret key \u2013 $LH_S3_SECRET_KEY (contents of this value) Activate now \u2013 Yes Catalog type - Apache Iceberg Catalog name - customer When done press Add and Activate now. Your UI should change to display the new bucket (Your screen may be slightly different). Note : This step may take a minute to complete. At this point you need to Associate the bucket with the Presto engine. When you hover your mouse over the Customer catalog and the Associate icon will display. If you do not see the Associate icon, refresh the browser page. Press the associate button and the following dialog will display. Select the presto-01 engine and then press the Save and restart engine button. Associate button and wait for the screen to refresh. Note : Your display will be different.","title":"Create new bucket in MinIO"},{"location":"wxd-objectstore/#exploring-the-customer-bucket","text":"First check to make sure that the Presto engine has finished starting. While the watsonx.data UI has restarted the Presto process, it takes a few seconds to become available. check_presto Switch to the bin directory as the root user. sudo su - cd /root/ibm-lh-dev/bin Connect to Presto using the new customer catalog. ./presto-cli --catalog customer We will create a schema where we store our table data using the new catalog name we created for the customer bucket. CREATE SCHEMA IF NOT EXISTS newworkshop with (location='s3a://customer/'); Switch to the new schema. use newworkshop; Use the following SQL to create a new table in the customer bucket. create table customer as select * from tpch.tiny.customer; CREATE TABLE: 1500 rows Quit Presto. quit; You can use the Developer sandbox (bin/dev-sandbox.sh), as described in MinIO UI , to inspect the Customer bucket with the s3-inspect utility. It's easier to use the MinIO console to view the bucket instead. Open your browser and navigate to the MinIO console. From the main screen select Object Browser and view the contents of the customer bucket. Note : You can continue to add new buckets when working with the watsonx.data UI. 
However, if you delete the catalog or bucket in the UI, you may not be able to re-catalog it. If you find that this happens, create another bucket, or rename the original one if that is possible.","title":"Exploring the Customer bucket"},{"location":"wxd-presto/","text":"Using the Presto console UI Your TechZone reservation will include the server name and port number to use when connecting to the Presto UI. The default port number is 8443 and the server will be referred to as region.techzone-server.com . Replace these values with those found in your reservation. Presto console - https://region.techzone-server.com:port Credentials: username: ibmlhadmin password: password The Presto console allows you to do the following: Monitor state of the cluster Queries being executed Queries in queue Data throughput Query details (text and plan) Note : The Presto console is very valuable when it comes to diagnosing problems with any queries you run in the watsonx.data environment. If a query fails you can find more details in the Presto console using the instructions below. On the main Presto screen, click the Finished Button (middle of the screen). A list of finished queries will display below the tab bar. You can scroll through the list of queries and get details of the execution plans. If you scroll through the list, you should see the test query \"select * from customer limit 5\". If you had a query that failed, look for the SQL in this list and continue on with the next step. Click on the query ID to see details of the execution plan that Presto produced. You can get more information about the query by clicking on any of the tabs that are on this screen. For instance, the Live Plan tab will show a visual explain of the stages that the query went through during execution. Scrolling to the bottom of this screen will also display any error messages that may have been produced by the SQL. Take time to check out the other information that is available for the query including the stage performance.","title":"Presto UI"},{"location":"wxd-presto/#using-the-presto-console-ui","text":"Your TechZone reservation will include the server name and port number to use when connecting to the Presto UI. The default port number is 8443 and the server will be referred to as region.techzone-server.com . Replace these values with those found in your reservation. Presto console - https://region.techzone-server.com:port Credentials: username: ibmlhadmin password: password The Presto console allows you to do the following: Monitor state of the cluster Queries being executed Queries in queue Data throughput Query details (text and plan) Note : The Presto console is very valuable when it comes to diagnosing problems with any queries you run in the watsonx.data environment. If a query fails you can find more details in the Presto console using the instructions below. On the main Presto screen, click the Finished Button (middle of the screen). A list of finished queries will display below the tab bar. You can scroll through the list of queries and get details of the execution plans. If you scroll through the list, you should see the test query \"select * from customer limit 5\". If you had a query that failed, look for the SQL in this list and continue on with the next step. Click on the query ID to see details of the execution plan that Presto produced. You can get more information about the query by clicking on any of the tabs that are on this screen.
For instance, the Live Plan tab will show a visual explain of the stages that the query went through during execution. Scrolling to the bottom of this screen will also display any error messages that may have been produced by the SQL. Take time to check out the other information that is available for the query including the stage performance.","title":"Using the Presto console UI"},{"location":"wxd-prestocli/","text":"Watsonx.data Introduction Watsonx.data is based on open source PrestoDB, a distributed query engine that enables querying data stored in open file formats using open table formats for optimization or performance. Some of the characteristics which you will learn and see in action include: Compute processing is performed in memory and in parallel. Data is pipelined between query stages and over the network reducing latency overhead that one would have if disk I/O were involved. All the below tasks will be done using the Developer edition of watsonx.data. Using watsonx.data Connectivity to watsonx.data can be done using the following methods: Command line interface (CLI) JDBC drivers watsonx.data UI Connecting to watsonx.data and executing queries using CLI Open the watsonx.data CLI using the development directory. Make sure you are the root user. whoami If not, switch to the root user. sudo su - Change to the development directory. cd /root/ibm-lh-dev/bin Start the Presto CLI. ./presto-cli We are going to inspect the available catalogs in the watsonx.data system. A watsonx.data catalog contains schemas and references a data source via a connector. A connector is like a driver for a database. Watsonx.data connectors are an implementation of Presto\u2019s SPI which allows Presto to interact with a resource. There are several built-in connectors for JMX, Hive, TPCH etc., some of which you will use as part of the labs. Display the catalogs. show catalogs; Catalog --------------- hive_data iceberg_data jmx system tpcds tpch (6 rows) Let's look up what schemas are available with any given catalog. We will use the TPCH catalog which is an internal PrestoDB auto-generated catalog and look at the available schemas. show schemas in tpch; Schema -------------------- information_schema sf1 sf100 sf1000 sf10000 sf100000 sf300 sf3000 sf30000 tiny (10 rows) Quit the presto-cli interface by executing the \"quit;\" command. quit; You can connect to a specific catalog and schema and look at the tables etc. ./presto-cli --catalog tpch --schema tiny presto:tiny> You will notice that the Presto prompt includes the name of the schema we are currently connected to. Look at the available tables in the TPCH catalog under the tiny schema. show tables; Table ---------- customer lineitem nation orders part partsupp region supplier (8 rows) Inspect schema of the customer table. describe customer; Column | Type | Extra | Comment ------------+--------------+-------+--------- custkey | bigint | | name | varchar(25) | | address | varchar(40) | | nationkey | bigint | | phone | varchar(15) | | acctbal | double | | mktsegment | varchar(10) | | comment | varchar(117) | | (8 rows) You could also use the syntax below to achieve the same result. show columns from customer; Column | Type | Extra | Comment -----------+--------------+-------+--------- custkey | bigint | | name | varchar(25) | | address | varchar(40) | | nationkey | bigint | | phone | varchar(15) | | acctbal | double | | mktsegment | varchar(10) | | comment | varchar(117) | | (8 rows) Inspect available functions. 
show functions like 'date%'; Function | Return Type | Argument Types | Function Type | Deterministic | Description | Variable Arity | Built In | Temporary | Language -------------+--------------------------+----------------------------------------------------------------+---------------+---------------+-------------------------------------------------------------+----------------+----------+-----------+---------- date | date | timestamp | scalar | true | | false | true | false | date | date | timestamp with time zone | scalar | true | | false | true | false | date | date | varchar(x) | scalar | true | | false | true | false | date_add | date | varchar(x), bigint, date | scalar | true | add the specified amount of date to the given date | false | true | false | date_add | time | varchar(x), bigint, time | scalar | true | add the specified amount of time to the given time | false | true | false | date_add | time with time zone | varchar(x), bigint, time with time zone | scalar | true | add the specified amount of time to the given time | false | true | false | date_add | timestamp | varchar(x), bigint, timestamp | scalar | true | add the specified amount of time to the given timestamp | false | true | false | date_add | timestamp with time zone | varchar(x), bigint, timestamp with time zone | scalar | true | add the specified amount of time to the given timestamp | false | true | false | date_diff | bigint | varchar(x), date, date | scalar | true | difference of the given dates in the given unit | false | true | false | date_diff | bigint | varchar(x), time with time zone, time with time zone | scalar | true | difference of the given times in the given unit | false | true | false | date_diff | bigint | varchar(x), time, time | scalar | true | difference of the given times in the given unit | false | true | false | date_diff | bigint | varchar(x), timestamp with time zone, timestamp with time zone | scalar | true | difference of the given times in the given unit | false | true | false | date_diff | bigint | varchar(x), timestamp, timestamp | scalar | true | difference of the given times in the given unit | false | true | false | date_format | varchar | timestamp with time zone, varchar(x) | scalar | true | | false | true | false | date_format | varchar | timestamp, varchar(x) | scalar | true | | false | true | false | date_parse | timestamp | varchar(x), varchar(y) | scalar | true | | false | true | false | date_trunc | date | varchar(x), date | scalar | true | truncate to the specified precision in the session timezone | false | true | false | date_trunc | time | varchar(x), time | scalar | true | truncate to the specified precision in the session timezone | false | true | false | date_trunc | time with time zone | varchar(x), time with time zone | scalar | true | truncate to the specified precision | false | true | false | date_trunc | timestamp | varchar(x), timestamp | scalar | true | truncate to the specified precision in the session timezone | false | true | false | date_trunc | timestamp with time zone | varchar(x), timestamp with time zone | scalar | true | truncate to the specified precision | false | true | false | (21 rows) Switch to a different schema. use sf1; Display the Tables in the schema. show tables; Table ---------- customer lineitem nation orders part partsupp region supplier (8 rows) Query data from customer table. 
select * from customer limit 5; custkey | name | address | nationkey | phone | acctbal | mktsegment | comment ---------+--------------------+------------------------------------------+-----------+-----------------+---------+------------+------------------------------------------------------------------------------------------------------- 37501 | Customer#000037501 | Ftb6T5ImHuJ | 2 | 12-397-688-6719 | -324.85 | HOUSEHOLD | pending ideas use carefully. express, ironic platelets use among the furiously regular instructions. 37502 | Customer#000037502 | ppCVXCFV,4JJ97IibbcMB5,aPByjYL07vmOLO 3m | 18 | 28-515-931-4624 | 5179.2 | BUILDING | express deposits. pending, regular deposits wake furiously bold deposits. regular 37503 | Customer#000037503 | Cg60cN3LGIUpLpXn0vRffQl8 | 13 | 23-977-571-7365 | 1862.32 | BUILDING | ular deposits. furiously ironic deposits integrate carefully among the iron 37504 | Customer#000037504 | E1 IiMlCfW7I4 1b9wfDZR | 21 | 31-460-590-3623 | 2955.33 | HOUSEHOLD | s believe slyly final foxes. furiously e 37505 | Customer#000037505 | Ad,XVdA6XAa0h aukZHUo5Mxh,ZRwVR3k7b7 | 3 | 13-521-760-7263 | 3243.15 | FURNITURE | ites according to the quickly bold instru (5 rows) Gather statistics on a given table. show stats for customer; column_name | data_size | distinct_values_count | nulls_fraction | row_count | low_value | high_value -------------+-------------+-----------------------+----------------+-----------+-----------+------------ custkey | NULL | 150039.0 | 0.0 | NULL | 1 | 150000 name | 2700000.0 | 149980.0 | 0.0 | NULL | NULL | NULL address | 3758056.0 | 150043.0 | 0.0 | NULL | NULL | NULL nationkey | NULL | 25.0 | 0.0 | NULL | 0 | 24 phone | 2250000.0 | 150018.0 | 0.0 | NULL | NULL | NULL acctbal | NULL | 140166.0 | 0.0 | NULL | -999.99 | 9999.99 mktsegment | 1349610.0 | 5.0 | 0.0 | NULL | NULL | NULL comment | 1.0876099E7 | 149987.0 | 0.0 | NULL | NULL | NULL NULL | NULL | NULL | NULL | 150000.0 | NULL | NULL (9 rows) Quit Presto. quit;","title":"Presto CLI"},{"location":"wxd-prestocli/#watsonxdata-introduction","text":"Watsonx.data is based on open source PrestoDB, a distributed query engine that enables querying data stored in open file formats using open table formats for optimization or performance. Some of the characteristics which you will learn and see in action include: Compute processing is performed in memory and in parallel. Data is pipelined between query stages and over the network reducing latency overhead that one would have if disk I/O were involved. All the below tasks will be done using the Developer edition of watsonx.data.","title":"Watsonx.data Introduction"},{"location":"wxd-prestocli/#using-watsonxdata","text":"Connectivity to watsonx.data can be done using the following methods: Command line interface (CLI) JDBC drivers watsonx.data UI","title":"Using watsonx.data"},{"location":"wxd-prestocli/#connecting-to-watsonxdata-and-executing-queries-using-cli","text":"Open the watsonx.data CLI using the development directory. Make sure you are the root user. whoami If not, switch to the root user. sudo su - Change to the development directory. cd /root/ibm-lh-dev/bin Start the Presto CLI. ./presto-cli We are going to inspect the available catalogs in the watsonx.data system. A watsonx.data catalog contains schemas and references a data source via a connector. A connector is like a driver for a database. Watsonx.data connectors are an implementation of Presto\u2019s SPI which allows Presto to interact with a resource. 
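A useful aside: a fully qualified table name in Presto takes the form catalog.schema.table, so you can query any catalog without reconnecting to it. For example (a minimal sketch using the tpch catalog explored in this section; the standard --execute flag runs one statement and exits, which is handy for scripting): ./presto-cli --execute \"select count(*) from tpch.tiny.customer;\"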
There are several built-in connectors for JMX, Hive, TPCH etc., some of which you will use as part of the labs. Display the catalogs. show catalogs; Catalog --------------- hive_data iceberg_data jmx system tpcds tpch (6 rows) Let's look up what schemas are available with any given catalog. We will use the TPCH catalog which is an internal PrestoDB auto-generated catalog and look at the available schemas. show schemas in tpch; Schema -------------------- information_schema sf1 sf100 sf1000 sf10000 sf100000 sf300 sf3000 sf30000 tiny (10 rows) Quit the presto-cli interface by executing the \"quit;\" command. quit; You can connect to a specific catalog and schema and look at the tables etc. ./presto-cli --catalog tpch --schema tiny presto:tiny> You will notice that the Presto prompt includes the name of the schema we are currently connected to. Look at the available tables in the TPCH catalog under the tiny schema. show tables; Table ---------- customer lineitem nation orders part partsupp region supplier (8 rows) Inspect schema of the customer table. describe customer; Column | Type | Extra | Comment ------------+--------------+-------+--------- custkey | bigint | | name | varchar(25) | | address | varchar(40) | | nationkey | bigint | | phone | varchar(15) | | acctbal | double | | mktsegment | varchar(10) | | comment | varchar(117) | | (8 rows) You could also use the syntax below to achieve the same result. show columns from customer; Column | Type | Extra | Comment -----------+--------------+-------+--------- custkey | bigint | | name | varchar(25) | | address | varchar(40) | | nationkey | bigint | | phone | varchar(15) | | acctbal | double | | mktsegment | varchar(10) | | comment | varchar(117) | | (8 rows) Inspect available functions. show functions like 'date%'; Function | Return Type | Argument Types | Function Type | Deterministic | Description | Variable Arity | Built In | Temporary | Language -------------+--------------------------+----------------------------------------------------------------+---------------+---------------+-------------------------------------------------------------+----------------+----------+-----------+---------- date | date | timestamp | scalar | true | | false | true | false | date | date | timestamp with time zone | scalar | true | | false | true | false | date | date | varchar(x) | scalar | true | | false | true | false | date_add | date | varchar(x), bigint, date | scalar | true | add the specified amount of date to the given date | false | true | false | date_add | time | varchar(x), bigint, time | scalar | true | add the specified amount of time to the given time | false | true | false | date_add | time with time zone | varchar(x), bigint, time with time zone | scalar | true | add the specified amount of time to the given time | false | true | false | date_add | timestamp | varchar(x), bigint, timestamp | scalar | true | add the specified amount of time to the given timestamp | false | true | false | date_add | timestamp with time zone | varchar(x), bigint, timestamp with time zone | scalar | true | add the specified amount of time to the given timestamp | false | true | false | date_diff | bigint | varchar(x), date, date | scalar | true | difference of the given dates in the given unit | false | true | false | date_diff | bigint | varchar(x), time with time zone, time with time zone | scalar | true | difference of the given times in the given unit | false | true | false | date_diff | bigint | varchar(x), time, time | scalar | true | difference of the 
given times in the given unit | false | true | false | date_diff | bigint | varchar(x), timestamp with time zone, timestamp with time zone | scalar | true | difference of the given times in the given unit | false | true | false | date_diff | bigint | varchar(x), timestamp, timestamp | scalar | true | difference of the given times in the given unit | false | true | false | date_format | varchar | timestamp with time zone, varchar(x) | scalar | true | | false | true | false | date_format | varchar | timestamp, varchar(x) | scalar | true | | false | true | false | date_parse | timestamp | varchar(x), varchar(y) | scalar | true | | false | true | false | date_trunc | date | varchar(x), date | scalar | true | truncate to the specified precision in the session timezone | false | true | false | date_trunc | time | varchar(x), time | scalar | true | truncate to the specified precision in the session timezone | false | true | false | date_trunc | time with time zone | varchar(x), time with time zone | scalar | true | truncate to the specified precision | false | true | false | date_trunc | timestamp | varchar(x), timestamp | scalar | true | truncate to the specified precision in the session timezone | false | true | false | date_trunc | timestamp with time zone | varchar(x), timestamp with time zone | scalar | true | truncate to the specified precision | false | true | false | (21 rows) Switch to a different schema. use sf1; Display the Tables in the schema. show tables; Table ---------- customer lineitem nation orders part partsupp region supplier (8 rows) Query data from customer table. select * from customer limit 5; custkey | name | address | nationkey | phone | acctbal | mktsegment | comment ---------+--------------------+------------------------------------------+-----------+-----------------+---------+------------+------------------------------------------------------------------------------------------------------- 37501 | Customer#000037501 | Ftb6T5ImHuJ | 2 | 12-397-688-6719 | -324.85 | HOUSEHOLD | pending ideas use carefully. express, ironic platelets use among the furiously regular instructions. 37502 | Customer#000037502 | ppCVXCFV,4JJ97IibbcMB5,aPByjYL07vmOLO 3m | 18 | 28-515-931-4624 | 5179.2 | BUILDING | express deposits. pending, regular deposits wake furiously bold deposits. regular 37503 | Customer#000037503 | Cg60cN3LGIUpLpXn0vRffQl8 | 13 | 23-977-571-7365 | 1862.32 | BUILDING | ular deposits. furiously ironic deposits integrate carefully among the iron 37504 | Customer#000037504 | E1 IiMlCfW7I4 1b9wfDZR | 21 | 31-460-590-3623 | 2955.33 | HOUSEHOLD | s believe slyly final foxes. furiously e 37505 | Customer#000037505 | Ad,XVdA6XAa0h aukZHUo5Mxh,ZRwVR3k7b7 | 3 | 13-521-760-7263 | 3243.15 | FURNITURE | ites according to the quickly bold instru (5 rows) Gather statistics on a given table. 
show stats for customer; column_name | data_size | distinct_values_count | nulls_fraction | row_count | low_value | high_value -------------+-------------+-----------------------+----------------+-----------+-----------+------------ custkey | NULL | 150039.0 | 0.0 | NULL | 1 | 150000 name | 2700000.0 | 149980.0 | 0.0 | NULL | NULL | NULL address | 3758056.0 | 150043.0 | 0.0 | NULL | NULL | NULL nationkey | NULL | 25.0 | 0.0 | NULL | 0 | 24 phone | 2250000.0 | 150018.0 | 0.0 | NULL | NULL | NULL acctbal | NULL | 140166.0 | 0.0 | NULL | -999.99 | 9999.99 mktsegment | 1349610.0 | 5.0 | 0.0 | NULL | NULL | NULL comment | 1.0876099E7 | 149987.0 | 0.0 | NULL | NULL | NULL NULL | NULL | NULL | NULL | 150000.0 | NULL | NULL (9 rows) Quit Presto. quit;","title":"Connecting to watsonx.data and executing queries using CLI"},{"location":"wxd-quick/","text":"Quick Start The following sections describe how to get started quickly with the watsonx.data developer system. If you are not familiar with the tools mentioned below, select the details link for more instructions. Requesting an IBM userid Requesting a TechZone image Accessing the Image SSH Access Open Ports Passwords Portainer Console Documentation IBM Userid An IBMid is needed to access IBM Technology Zone. If you do not have an IBMid, click on the following link and request a new IBMid. https://techzone.ibm.com More details: Creating an IBM Userid Requesting a TechZone image Log into TechZone ( https://techzone.ibm.com ) and search for the watsonx.data Developer Base Image or use the following link. https://techzone.ibm.com/collection/ibm-watsonxdata-developer-base-image Problem with reservations failing? Check the TechZone status page at https://techzone.status.io . More details: Reserving a TechZone image Accessing the Image The email from TechZone indicating that the image is ready will contain a link to your reservations. Click on the link and search for the watsonx.data reservation. More details: Accessing a TechZone image SSH Access Your TechZone reservation will include the server name and port number to use when connecting using ssh. The port number is referred to as port in the command below, while the server will be referred to as region.techzone-server.com . Replace these values with those found in your reservation. Open a terminal window and use the following syntax to connect as the watsonx userid. ssh -p port watsonx@region.techzone-server.com The port number and server name are provided as part of the TechZone reservation details. To become the root user, issue the following command. sudo su - Password for both users is watsonx.data . You can copy files into and out of the server using the following syntax: scp -P port myfile.txt watsonx@region.techzone-server.com:/tmp/myfile.txt scp -P port watsonx@region.techzone-server.com:/tmp/myfile.txt myfile.txt More details: SSH Access Open Ports The following URLs and Ports are used to access the watsonx.data services. Most browsers will work with these URLs. However, Mac OSX users should be aware that accessing the MinIO console may occasionally cause Firefox and Chrome to lock up. If you find that this occurs, try using Safari which appears to work fine. The ports that are used in the lab are listed below. Note that the internal port number is always the same when running in the VMware image using the VM Remote Console. When using your workstation's browser, you will need to use the server name and port number supplied in the TechZone reservation. 
Service Port Active watsonx.data management console 9443 Yes Presto console 8443 Yes MinIO console (S3 buckets) 9001 Yes MinIO S3 Endpoint 9000 Yes Portainer (Docker container management) 6443 Yes Apache Superset (Query and Graphing) 8088 No Jupyter Notebook 8888 Yes Presto External Port 8443 Yes Hive metadata Port 9043 Yes MySQL External Port 3306 Yes Postgres External Port 5432 Yes Db2 Database Port 50000 Yes VNC Port 5901 No Note : The following ports are not active unless the service is started: Apache Superset (8088) VNC Terminal Display (5901) More details: Open Ports Passwords This table lists the passwords for the services that have \"fixed\" userids and passwords. Service Userid Password Virtual Machine watsonx watsonx.data Virtual Machine root watsonx.data watsonx.data UI ibmlhadmin password Jupyter Notebook none watsonx.data Presto ibmlhadmin password Minio Generated Generated Postgres admin Generated Apache Superset admin admin Portainer admin watsonx.data Db2 db2inst1 db2inst1 MySQL root password VNC Windows none watsonx. VNC OSX none watsonx.data Use the following commands to get the generated userid and password for MinIO. export LH_S3_ACCESS_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_ACCESS_KEY | sed 's/.*=//') export LH_S3_SECRET_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_SECRET_KEY | sed 's/.*=//') echo \"MinIO Userid : \" $LH_S3_ACCESS_KEY echo \"MinIO Password: \" $LH_S3_SECRET_KEY Use the following command to get the password for Postgres. export POSTGRES_PASSWORD=$(docker exec ibm-lh-postgres printenv | grep POSTGRES_PASSWORD | sed 's/.*=//') echo \"Postgres Userid : admin\" echo \"Postgres Password : \" $POSTGRES_PASSWORD You can get all passwords for the system when you are logged in by issuing the following command: cat /certs/passwords If the passwords do not appear to work, you may need to regenerate them. The following must be run as the root user. sudo su - passwords The passwords command will refresh the passwords and also display them. If this command is not run as root, an error message will be displayed because the password file cannot be updated as the watsonx user. More details: Passwords Portainer This lab system has Portainer installed. Portainer provides an administrative interface to the Docker images that are running on this system. You can use this console to check that all the containers are running and see what resources they are using. Open your TechZone reservation and select the Portainer link to connect to it. Credentials: userid: admin password: watsonx.data More details: Portainer Documentation The following links provide more information on the components in this lab. watsonx.data - https://www.ibm.com/docs/en/watsonxdata/1.0.x Presto SQL - https://prestodb.io/docs/current/sql.html Presto Console - https://prestodb.io/docs/current/admin/web-interface.html MinIO - https://min.io/docs/minio/linux/administration/minio-console.html Apache Superset - https://superset.apache.org/docs/creating-charts-dashboards/exploring-data dBeaver - https://dbeaver.com/docs/wiki/Application-Window-Overview/ Db2 SQL - https://www.ibm.com/docs/en/db2/11.5?topic=queries-select-statement PostgreSQL SQL - https://www.postgresql.org/docs/current/sql.html","title":"Quick Start"},{"location":"wxd-quick/#quick-start","text":"The following sections describe how to get started quickly with the watsonx.data developer system. If you are not familiar with the tools mentioned below, select the details link for more instructions.
Requesting an IBM userid Requesting a TechZone image Accessing the Image SSH Access Open Ports Passwords Portainer Console Documentation","title":"Quick Start"},{"location":"wxd-quick/#ibm-userid","text":"An IBMid is needed to access IBM Technology Zone. If you do not have an IBMid, click on the following link and request a new IBMid. https://techzone.ibm.com More details: Creating an IBM Userid","title":"IBM Userid"},{"location":"wxd-quick/#requesting-a-techzone-image","text":"Log into TechZone ( https://techzone.ibm.com ) and search for the watsonx.data Developer Base Image or use the following link. https://techzone.ibm.com/collection/ibm-watsonxdata-developer-base-image Problem with reservations failing? Check the TechZone status page at https://techzone.status.io . More details: Reserving a TechZone image","title":"Requesting a TechZone image"},{"location":"wxd-quick/#accessing-the-image","text":"The email from TechZone indicating that the image is ready will contain a link to your reservations. Click on the link and search for the watsonx.data reservation. More details: Accessing a TechZone image","title":"Accessing the Image"},{"location":"wxd-quick/#ssh-access","text":"Your TechZone reservation will include the server name and port number to use when connecting using ssh. The port number is referred to as port in the command below, while the server will be referred to as region.techzone-server.com . Replace these values with those found in your reservation. Open a terminal window and use the following syntax to connect as the watsonx userid. ssh -p port watsonx@region.techzone-server.com The port number and server name are provided as part of the TechZone reservation details. To become the root user, issue the following command. sudo su - Password for both users is watsonx.data . You can copy files into and out of the server using the following syntax: scp -P port myfile.txt watsonx@region.techzone-server.com:/tmp/myfile.txt scp -P port watsonx@region.techzone-server.com:/tmp/myfile.txt myfile.txt More details: SSH Access","title":"SSH Access"},{"location":"wxd-quick/#open-ports","text":"The following URLs and Ports are used to access the watsonx.data services. Most browsers will work with these URLs. However, Mac OSX users should be aware that accessing the MinIO console may occasionally cause Firefox and Chrome to lock up. If you find that this occurs, try using Safari which appears to work fine. The ports that are used in the lab are listed below. Note that the internal port number is always the same when running in the VMware image using the VM Remote Console. When using your workstation's browser, you will need to use the server name and port number supplied in the TechZone reservation. Service Port Active watsonx.data management console 9443 Yes Presto console 8443 Yes MinIO console (S3 buckets) 9001 Yes MinIO S3 Endpoint 9000 Yes Portainer (Docker container management) 6443 Yes Apache Superset (Query and Graphing) 8088 No Jupyter Notebook 8888 Yes Presto External Port 8443 Yes Hive metadata Port 9043 Yes MySQL External Port 3306 Yes Postgres External Port 5432 Yes Db2 Database Port 50000 Yes VNC Port 5901 No Note : The following ports are not active unless the service is started: Apache Superset (8088) VNC Terminal Display (5901) More details: Open Ports","title":"Open Ports"},{"location":"wxd-quick/#passwords","text":"This table lists the passwords for the services that have \"fixed\" userids and passwords. 
Service Userid Password Virtual Machine watsonx watsonx.data Virtual Machine root watsonx.data watsonx.data UI ibmlhadmin password Jupyter Notebook none watsonx.data Presto ibmlhadmin password Minio Generated Generated Postgres admin Generated Apache Superset admin admin Portainer admin watsonx.data Db2 db2inst1 db2inst1 MySQL root password VNC Windows none watsonx. VNC OSX none watsonx.data Use the following commands to get the generated userid and password for MinIO. export LH_S3_ACCESS_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_ACCESS_KEY | sed 's/.*=//') export LH_S3_SECRET_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_SECRET_KEY | sed 's/.*=//') echo \"MinIO Userid : \" $LH_S3_ACCESS_KEY echo \"MinIO Password: \" $LH_S3_SECRET_KEY Use the following command to get the password for Postgres. export POSTGRES_PASSWORD=$(docker exec ibm-lh-postgres printenv | grep POSTGRES_PASSWORD | sed 's/.*=//') echo \"Postgres Userid : admin\" echo \"Postgres Password : \" $POSTGRES_PASSWORD You can get all passwords for the system when you are logged in by issuing the following command: cat /certs/passwords If the passwords do not appear to work, you may need to regenerate them. The following must be run as the root user. sudo su - passwords The passwords command will refresh the passwords and also display them. If this command is not run as root, an error message will be displayed because the password file cannot be updated as the watsonx user. More details: Passwords","title":"Passwords"},{"location":"wxd-quick/#portainer","text":"This lab system has Portainer installed. Portainer provides an administrative interface to the Docker images that are running on this system. You can use this console to check that all the containers are running and see what resources they are using. Open your TechZone reservation and select the Portainer link to connect to it. Credentials: userid: admin password: watsonx.data More details: Portainer","title":"Portainer"},{"location":"wxd-quick/#documentation","text":"The following links provide more information on the components in this lab. watsonx.data - https://www.ibm.com/docs/en/watsonxdata/1.0.x Presto SQL - https://prestodb.io/docs/current/sql.html Presto Console - https://prestodb.io/docs/current/admin/web-interface.html MinIO - https://min.io/docs/minio/linux/administration/minio-console.html Apache Superset - https://superset.apache.org/docs/creating-charts-dashboards/exploring-data dBeaver - https://dbeaver.com/docs/wiki/Application-Window-Overview/ Db2 SQL - https://www.ibm.com/docs/en/db2/11.5?topic=queries-select-statement PostgreSQL SQL - https://www.postgresql.org/docs/current/sql.html","title":"Documentation"},{"location":"wxd-reference-access/","text":"Accessing the watsonx.data TechZone Image The reservation email from TechZone is extremely important since it provides a link to your reservation. Click on the View My Reservations to access your reservations. Click on the reservation that corresponds to the watsonx.data reservation. The menu button that is beside the arrow provides options to extend or delete the reservation. When you click on the reservation details option, or the reservation box, the browser will display the details of your image. Scroll down to the bottom of the web page to access the VM Remote Console. You can access the logon screen of the virtual machine by pressing the VM Remote Console button. It is not necessary to use the VM console unless you want to use the dBeaver program.
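If you would rather run dBeaver (or another JDBC tool) on your own workstation, Presto is also reachable over JDBC. A sketch of the connection settings, assuming the external Presto server name and port from your reservation and the Presto credentials listed above: URL jdbc:presto://region.techzone-server.com:port/tpch/tiny , userid ibmlhadmin , password password , with SSL enabled. The exact SSL options depend on your driver version.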
Select the watsonx user and use watsonx.data as the password. Refer to the section on VM Remote Console for more details.","title":"Accessing the reservation"},{"location":"wxd-reference-access/#accessing-the-watsonxdata-techzone-image","text":"The reservation email from TechZone is extremely important since it provides a link to your reservation. Click on the View My Reservations to access your reservations. Click on the reservation that corresponds to the watsonx.data reservation. The menu button that is beside the arrow provides options to extend or delete the reservation. When you click on the reservation details option, or the reservation box, the browser will display the details of your image. Scroll down to the bottom of the web page to access the VM Remote Console. You can access the logon screen of the virtual machine by pressing the VM Remote Console button. It is not necessary to use the VM console unless you want to use the dBeaver program. Select the watsonx user and use watsonx.data as the password. Refer to the section on VM Remote Console for more details.","title":"Accessing the watsonx.data TechZone Image"},{"location":"wxd-reference-console/","text":"Using the VM Remote Console The watsonx server that has been provisioned has no physical monitor attached to it (commonly referred to as headless), so we need to use a different technique to view the desktop of the main user of the system (watsonx). The first thing to consider is whether you need to use the VM Remote Console at all. All the services, like the watsonx.data UI, MinIO, Presto, Apache Superset and Portainer, are web-based servers, which means you just need to use your own browser to access these programs. Connecting into the watsonx virtual machine can be done using the secure shell command (ssh), which provides access to all the low-level commands you might need to use, like starting the Apache Superset service. Note that Apache Superset is not up and running by default, so you will need to start it before attempting to connect to it. So what's the VM Remote Console required for? One program that has been provided to view the database schemas is dBeaver, a community edition of software that provides a query interface to hundreds of data sources, including the watsonx.data environment. You can only use this program using the VM Remote Console. You do have the option of installing this software on your own machine if you wish. Find your email message that contains details of your reservation. Details of the reservation, and the page containing those details, can be found in the Accessing the reservation or Accessing a workshop section. Once the details appear, scroll down to the bottom of the web page, and you will see the VM Remote Console button. You can access the logon screen of the virtual machine by pressing the VM Remote Console button. Clicking on this button will display the logon screen for the server. Select the watsonx user and use watsonx.data as the password. You can open this window in a separate browser window, or place it into full-screen mode. Note that you may need to increase the size of your browser window (or change the scaling in the browser) to see all of the virtual desktop. At this point you have access to the desktop of the watsonx user and can issue commands from within this environment.
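For example, a quick check that the watsonx.data services are up can be done from any terminal session by listing the running containers (a sketch using standard Docker commands; container names such as ibm-lh-presto appear elsewhere in this lab): sudo docker ps --format 'table {{.Names}}\t{{.Status}}'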
As mentioned previously, you do not need to use this interface to use the lab.","title":"VM Remote Console"},{"location":"wxd-reference-console/#using-the-vm-remote-console","text":"The watsonx server that has been provisioned has no physical monitor attached to it (commonly referred to as headless), so we need to use a different technique to view the desktop of the main user of the system (watsonx). The first thing to consider is whether you need to use the VM Remote Console at all. All the services, like the watsonx.data UI, MinIO, Presto, Apache Superset and Portainer, are web-based servers, which means you just need to use your own browser to access these programs. Connecting into the watsonx virtual machine can be done using the secure shell command (ssh), which provides access to all the low-level commands you might need to use, like starting the Apache Superset service. Note that Apache Superset is not up and running by default, so you will need to start it before attempting to connect to it. So what's the VM Remote Console required for? One program that has been provided to view the database schemas is dBeaver, a community edition of software that provides a query interface to hundreds of data sources, including the watsonx.data environment. You can only use this program using the VM Remote Console. You do have the option of installing this software on your own machine if you wish. Find your email message that contains details of your reservation. Details of the reservation, and the page containing those details, can be found in the Accessing the reservation or Accessing a workshop section. Once the details appear, scroll down to the bottom of the web page, and you will see the VM Remote Console button. You can access the logon screen of the virtual machine by pressing the VM Remote Console button. Clicking on this button will display the logon screen for the server. Select the watsonx user and use watsonx.data as the password. You can open this window in a separate browser window, or place it into full-screen mode. Note that you may need to increase the size of your browser window (or change the scaling in the browser) to see all of the virtual desktop. At this point you have access to the desktop of the watsonx user and can issue commands from within this environment. As mentioned previously, you do not need to use this interface to use the lab.","title":"Using the VM Remote Console"},{"location":"wxd-reference-documentation/","text":"Documentation The following links provide more information on the components in this lab. watsonx.data - https://www.ibm.com/docs/en/watsonxdata/1.0.x Presto SQL - https://prestodb.io/docs/current/sql.html Presto Console - https://prestodb.io/docs/current/admin/web-interface.html MinIO - https://min.io/docs/minio/linux/administration/minio-console.html MinIO CLI - https://min.io/docs/minio/linux/reference/minio-mc.html Apache Superset - https://superset.apache.org/docs/creating-charts-dashboards/exploring-data dBeaver - https://dbeaver.com/docs/wiki/Application-Window-Overview/ Db2 SQL - https://www.ibm.com/docs/en/db2/11.5?topic=queries-select-statement PostgreSQL SQL - https://www.postgresql.org/docs/current/sql.html MySQL SQL - https://dev.mysql.com/doc/refman/8.1/en/sql-statements.html","title":"Documentation"},{"location":"wxd-reference-documentation/#documentation","text":"The following links provide more information on the components in this lab.
watsonx.data - https://www.ibm.com/docs/en/watsonxdata/1.0.x Presto SQL - https://prestodb.io/docs/current/sql.html Presto Console - https://prestodb.io/docs/current/admin/web-interface.html MinIO - https://min.io/docs/minio/linux/administration/minio-console.html MinIO CLI - https://min.io/docs/minio/linux/reference/minio-mc.html Apache Superset - https://superset.apache.org/docs/creating-charts-dashboards/exploring-data dBeaver - https://dbeaver.com/docs/wiki/Application-Window-Overview/ Db2 SQL - https://www.ibm.com/docs/en/db2/11.5?topic=queries-select-statement PostgreSQL SQL - https://www.postgresql.org/docs/current/sql.html MySQL SQL - https://dev.mysql.com/doc/refman/8.1/en/sql-statements.html","title":"Documentation"},{"location":"wxd-reference-ibmid/","text":"Requesting an IBM Userid. An IBMid is needed to access IBM Technology Zone. If you do not have an IBMid, click on the following link. https://techzone.ibm.com You should see the following login screen for TechZone. Click on the `Create an IBMid` button and proceed to fill in the details on this form: Once you have verified your account, you can continue on to log into the TechZone server.","title":"Requesting an IBMid"},{"location":"wxd-reference-ibmid/#requesting-an-ibm-userid","text":"An IBMid is needed to access IBM Technology Zone. If you do not have an IBMid, click on the following link. https://techzone.ibm.com You should see the following login screen for TechZone. Click on the `Create an IBMid` button and proceed to fill in the details on this form: Once you have verified your account, you can continue on to log into the TechZone server.","title":"Requesting an IBM Userid."},{"location":"wxd-reference-passwords/","text":"Passwords This table lists the passwords for the services that have \"fixed\" userids and passwords. Service Userid Password Virtual Machine watsonx watsonx.data Virtual Machine root watsonx.data watsonx.data UI ibmlhadmin password Jupyter Notebook none watsonx.data Presto ibmlhadmin password Minio Generated Generated Postgres admin Generated Apache Superset admin admin Portainer admin watsonx.data Db2 db2inst1 db2inst1 MySQL root password VNC Windows none watsonx. VNC OSX none watsonx.data Use the following commands to get the generated userid and password for MinIO. export LH_S3_ACCESS_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_ACCESS_KEY | sed 's/.*=//') export LH_S3_SECRET_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_SECRET_KEY | sed 's/.*=//') echo \"MinIO Userid : \" $LH_S3_ACCESS_KEY echo \"MinIO Password: \" $LH_S3_SECRET_KEY Use the following command to get the password for Postgres. export POSTGRES_PASSWORD=$(docker exec ibm-lh-postgres printenv | grep POSTGRES_PASSWORD | sed 's/.*=//') echo \"Postgres Userid : admin\" echo \"Postgres Password : \" $POSTGRES_PASSWORD You can get all passwords for the system when you are logged in by issuing the following command: cat /certs/passwords If the passwords do not appear to work, you may need to regenerate them. The following must be run as the root user. sudo su - passwords The passwords command will refresh the passwords and also display them. If this command is not run as root, an error message will be displayed because the password file cannot be updated as the watsonx user.","title":"Userids and Passwords"},{"location":"wxd-reference-passwords/#passwords","text":"This table lists the passwords for the services that have \"fixed\" userids and passwords.
Service Userid Password Virtual Machine watsonx watsonx.data Virtual Machine root watsonx.data watsonx.data UI ibmlhadmin password Jupyter Notebook none watsonx.data Presto ibmlhadmin password Minio Generated Generated Postgres admin Generated Apache Superset admin admin Portainer admin watsonx.data Db2 db2inst1 db2inst1 MySQL root password VNC Windows none watsonx. VNC OSX none watsonx.data Use the following commands to get the generated userid and password for MinIO. export LH_S3_ACCESS_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_ACCESS_KEY | sed 's/.*=//') export LH_S3_SECRET_KEY=$(docker exec ibm-lh-presto printenv | grep LH_S3_SECRET_KEY | sed 's/.*=//') echo \"MinIO Userid : \" $LH_S3_ACCESS_KEY echo \"MinIO Password: \" $LH_S3_SECRET_KEY Use the following command to get the password for Postgres. export POSTGRES_PASSWORD=$(docker exec ibm-lh-postgres printenv | grep POSTGRES_PASSWORD | sed 's/.*=//') echo \"Postgres Userid : admin\" echo \"Postgres Password : \" $POSTGRES_PASSWORD You can get all passwords for the system when you are logged in by issuing the following command: cat /certs/passwords If the passwords do not appear to work, you may need to regenerate them. The following must be run as the root user. sudo su - passwords The passwords command will refresh the passwords and also display them. If this command is not run as root, an error message will be displayed because the password file cannot be updated as the watsonx user.","title":"Passwords"},{"location":"wxd-reference-portainer/","text":"Portainer This lab system has Portainer installed. Portainer provides an administrative interface to the Docker images that are running on this system. You can use this console to check that all the containers are running and see what resources they are using. Your TechZone reservation will include the server name and port number to use when connecting to Portainer. Open your browser and navigate to: Portainer console - https://region.techzone-server.com:port Credentials: userid: admin password: watsonx.data Once you have logged in, you should select \"Get Started\". The next screen displays the main control panel for Portainer. Select the Local server. This screen provides details on the containers, images, volumes, and networks that make up your docker installation. To view the containers that are running, select the container icon. From within this view, you can view the details of any container, including the environment settings and the current logs, and you can shell into the environment. For more details on Portainer, see the Portainer documentation .","title":"Portainer Console"},{"location":"wxd-reference-portainer/#portainer","text":"This lab system has Portainer installed. Portainer provides an administrative interface to the Docker images that are running on this system. You can use this console to check that all the containers are running and see what resources they are using. Your TechZone reservation will include the server name and port number to use when connecting to Portainer. Open your browser and navigate to: Portainer console - https://region.techzone-server.com:port Credentials: userid: admin password: watsonx.data Once you have logged in, you should select \"Get Started\". The next screen displays the main control panel for Portainer. Select the Local server. This screen provides details on the containers, images, volumes, and networks that make up your docker installation. To view the containers that are running, select the container icon.
From within this view, you can view the details of any container, including the environment settings and the current logs, and you can shell into the environment. For more details on Portainer, see the Portainer documentation .","title":"Portainer"},{"location":"wxd-reference-ports/","text":"Watsonx.data Ports Your TechZone reservation will contain a list of published services at the top of the details page. These URLs and Port numbers are needed to access the watsonx.data services. The list will contain the following information: SSH for watsonx userid - ssh -p 20200 watsonx@region.techzone-services.com MySQL Port - Server: region.techzone-services.com Port: 21409 PostgreSQL Port - Server: region.techzone-services.com Port: 38052 VNC Service - vnc://region.techzone-services.com:38725 Portainer console - https://region.techzone-services.com:44449 Apache Superset - http://region.techzone-services.com:41471 Presto console - https://region.techzone-services.com:49618 Presto Port - Server: region.techzone-services.com Port: 49618 Jupyter Notebook - Server: http://region.techzone-services.com:25490/notebooks/Table_of_Contents.ipynb Minio Endpoint - Server: region.techzone-services.com Port: 29652 Minio console - http://region.techzone-services.com:45050 Hive Thrift URL - thrift://region.techzone-services.com:22211 Watsonx UI - https://region.techzone-services.com:37997 Db2 Port - Server: region.techzone-services.com Port: 21361 There are two additional ports which are available for use with any service you install in the system. These ports are: Open Port 1 - Server: region.techzone-services.com: Port: 45779 Open Port 2 - Server: region.techzone-services.com: Port: 43151 The server URL will be different for each region and data center that your machine is provisioned on. The server name is usually in the format: region.techzone-services.com:port The port number that is provided in the reservation is mapped to the proper port number in the server. For instance, the Db2 Port number is 50000 in the server, but the reservation above shows a port number of 21361. Use the following rules for determining what server name and port number to use when connecting to the databases: If you are using a program \"inside\" the watsonx server, the host is watsonxdata or localhost . Some systems will require the watsonx service name (ibm-lh-postgres) and these are highlighted in the documentation. The port number will always be the native port (i.e. 5432 for PostgreSQL). If you are using a program \"outside\" the watsonx server, the host is the one provided in your reservation region.techzone-services.com and the port will be the one that is included as part of the URL (i.e. 49618 in the example above). Watsonx.data Open Ports The following URLs and Ports are used to access the watsonx.data services. Most browsers will work with these URLs. Note : Mac OSX users should be aware that accessing the MinIO console may occasionally cause Firefox and Chrome to lock up. If you find that this occurs, try using Safari which appears to work fine. The ports that are used in the lab are listed below, including their availability when you first access the lab. The internal port number is always the same when running in the VMware image using the VM Remote Console. When using your workstation's browser, you will need to use the server name and port number supplied in the TechZone reservation.
Service Port Active watsonx.data management console 9443 Yes Presto console 8443 Yes MinIO console (S3 buckets) 9001 Yes MinIO S3 Endpoint 9000 Yes Portainer (Docker container management) 6443 Yes Apache Superset (Query and Graphing) 8088 No Jupyter Notebook 8888 Yes Presto External Port 8443 Yes Hive metadata Port 9043 Yes MySQL External Port 3306 Yes Postgres External Port 5432 Yes Db2 Database Port 50000 Yes VNC Port 5901 No Note : The following ports are not active unless the service is started: Apache Superset (8088) VNC Terminal Display (5901) There are three https links that will result in a Certificate error in Firefox: watsonx.data UI Presto UI Portainer UI Follow these steps to ignore the error when accessing these URLs. Select Advanced. Choose \"Accept the Risk and Continue\". If you are using Google Chrome, you can bypass the error message by typing in \"thisisunsafe\" or clicking on the \"Proceed to 192.168.252.2 (unsafe)\" link.","title":"Available Ports"},{"location":"wxd-reference-ports/#watsonxdata-ports","text":"Your TechZone reservation will contain a list of published services at the top of the details page. These URLs and Port numbers are needed to access the watsonx.data services. The list will contain the following information: SSH for watsonx userid - ssh -p 20200 watsonx@region.techzone-services.com MySQL Port - Server: region.techzone-services.com Port: 21409 PostgreSQL Port - Server: region.techzone-services.com Port: 38052 VNC Service - vnc://region.techzone-services.com:38725 Portainer console - https://region.techzone-services.com:44449 Apache Superset - http://region.techzone-services.com:41471 Presto console - https://region.techzone-services.com:49618 Presto Port - Server: region.techzone-services.com Port: 49618 Jupyter Notebook - Server: http://region.techzone-services.com:25490/notebooks/Table_of_Contents.ipynb Minio Endpoint - Server: region.techzone-services.com Port: 29652 Minio console - http://region.techzone-services.com:45050 Hive Thrift URL - thrift://region.techzone-services.com:22211 Watsonx UI - https://region.techzone-services.com:37997 Db2 Port - Server: region.techzone-services.com Port: 21361 There are two additional ports which are available for use with any service you install in the system. These ports are: Open Port 1 - Server: region.techzone-services.com: Port: 45779 Open Port 2 - Server: region.techzone-services.com: Port: 43151 The server URL will be different for each region and data center that your machine is provisioned on. The server name is usually in the format: region.techzone-services.com:port The port number that is provided in the reservation is mapped to the proper port number in the server. For instance, the Db2 Port number is 50000 in the server, but the reservation above shows a port number of 21361. Use the following rules for determining what server name and port number to use when connecting to the databases: If you are using a program \"inside\" the watsonx server, the host is watsonxdata or localhost . Some systems will require the watsonx service name (ibm-lh-postgres) and these are highlighted in the documentation. The port number will always be the native port (i.e. 5432 for PostgreSQL). If you are using a program \"outside\" the watsonx server, the host is the one provided in your reservation region.techzone-services.com and the port will be the one that is included as part of the URL (i.e. 
49618 in the example above).","title":"Watsonx.data Ports"},{"location":"wxd-reference-ports/#watsonxdata-open-ports","text":"The following URLs and Ports are used to access the watsonx.data services. Most browsers will work with these URLs. Note : Mac OSX users should be aware that accessing the MinIO console may occasionally cause Firefox and Chrome to lock up. If you find that this occurs, try using Safari which appears to work fine. The ports that are used in the lab are listed below, including their availability when you first access the lab. The internal port number is always the same when running in the VMware image using the VM Remote Console. When using your workstation's browser, you will need to use the server name and port number supplied in the TechZone reservation. Service Port Active watsonx.data management console 9443 Yes Presto console 8443 Yes MinIO console (S3 buckets) 9001 Yes MinIO S3 Endpoint 9000 Yes Portainer (Docker container management) 6443 Yes Apache Superset (Query and Graphing) 8088 No Jupyter Notebook 8888 Yes Presto External Port 8443 Yes Hive metadata Port 9043 Yes MySQL External Port 3306 Yes Postgres External Port 5432 Yes Db2 Database Port 50000 Yes VNC Port 5901 No Note : The following ports are not active unless the service is started: Apache Superset (8088) VNC Terminal Display (5901) There are three https links that will result in a Certificate error in Firefox: watsonx.data UI Presto UI Portainer UI Follow these steps to ignore the error when accessing these URLs. Select Advanced. Choose \"Accept the Risk and Continue\". If you are using Google Chrome, you can bypass the error message by typing in \"thisisunsafe\" or clicking on the \"Proceed to 192.168.252.2 (unsafe)\" link.","title":"Watsonx.data Open Ports"},{"location":"wxd-reference-ssh/","text":"SSH Access All the commands in the lab will require you to execute commands in a terminal window. Access to a terminal window can be accomplished in three ways, as described below: Use the SSH Command Locally Use Jupyter notebook terminal Terminal window in the VM If you are unable to use ssh because of connection restrictions, the Jupyter notebook option is the easiest one to use. SSH Command Your TechZone reservation will include the server name and port number to use when connecting using ssh. The port number is referred to as port in the command below, while the server will be referred to as region.techzone-server.com . Replace these values with those found in your reservation. You have the choice of using the VM Remote console and logging in as the watsonx user to issue commands, or using a local terminal shell (iTerm, Hyper, terminal) to run commands against the watsonx.data server. You can have multiple connections into the machine at any one time. It will be easier to cut-and-paste commands into a local terminal shell. The VM Remote Console does not support cut-and-paste operations from outside the virtual console environment. Open a terminal window and use the following syntax to connect as the watsonx userid. ssh -p port watsonx@region.techzone-server.com The port number and server name are provided as part of the TechZone reservation details. To become the root user, issue the following command. sudo su - Password for both users is watsonx.data . Jupyter Notebook Terminal The Jupyter Notebook lab environment provided as part of the lab also provides a way of issuing terminal commands. To access this environment, you must find the Jupyter notebook URL provided in the lab reservation.
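The link will look similar to the Jupyter Notebook entry shown in the ports list, for example http://region.techzone-services.com:port/notebooks/Table_of_Contents.ipynb , with the server name and port number taken from your own reservation.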
When you initially open the link, it will request a password to view the Table of Contents: The default password for the notebook is watsonx.data . Once you enter the password, the Table of Contents will be displayed. Select the File menu at the top of the screen and then select New - Terminal. A terminal window will be displayed in the browser. You are now using a terminal session inside the watsonx.data server. By default, you are already the root user, so there is no need to run a sudo su - command. Note that you are not in the correct directory to run commands. You must issue the following command to be in the command directory. cd /root/ibm-lh-dev/bin All the commands in the lab can now be run from within this browser rather than using the VM Remote console. If at any time you accidentally close this window, you can open another one using the Jupyter notebook File menu. Terminal Window in the VM If you use the VM Remote Console , you can log in as the watsonx user and use a Terminal shell to run commands against the watsonx.data server. Select the Terminal application in the virtual machine to issue commands. This will open up the terminal window. At this point you are connected as the watsonx user. You can ignore any lab instructions that ask you to ssh into the watsonx server. To become the root user, you must enter the following command in the terminal window. sudo su - Now as the root user you will be ready to run the commands found in the lab. Copying Files If you need to move files into or out of the virtual machine, you can use the following commands. To copy a file into the virtual machine, use the following syntax: scp -P port myfile.txt watsonx@region.techzone-server.com:/tmp/myfile.txt The filename myfile.txt will be copied to the /tmp directory. The temporary directory is useful since you can copy the file to multiple places from within the Linux environment. Multiple files can be moved by using wildcard characters with the following syntax: scp -P port myfile.* watsonx@region.techzone-server.com:/tmp To move files from the image back to your local system, reverse the file specification. scp -P port watsonx@region.techzone-server.com:/tmp/myfile.txt /Downloads/myfile.txt You can also use wildcards to select more than one file.","title":"SSH and SCP Commands"},{"location":"wxd-reference-ssh/#ssh-access","text":"All the commands in the lab will require you to execute commands in a terminal window. Access to a terminal window can be accomplished in three ways, as described below: Use the SSH Command Locally Use Jupyter notebook terminal Terminal window in the VM If you are unable to use ssh because of connection restrictions, the Jupyter notebook option is the easiest one to use.","title":"SSH Access"},{"location":"wxd-reference-ssh/#ssh-command","text":"Your TechZone reservation will include the server name and port number to use when connecting using ssh. The port number is referred to as port in the command below, while the server will be referred to as region.techzone-server.com . Replace these values with those found in your reservation. You have the choice of using the VM Remote console and logging in as the watsonx user to issue commands, or using a local terminal shell (iTerm, Hyper, terminal) to run commands against the watsonx.data server. You can have multiple connections into the machine at any one time. It will be easier to cut-and-paste commands into a local terminal shell.
The VM Remote Console does not support cut-and-paste operations from outside the virtual console environment. Open a terminal window and use the following syntax to connect as the watsonx userid. ssh -p port watsonx@region.techzone-server.com The port number and server name are provided as part of the TechZone reservation details. To become the root user, issue the following command. sudo su - Password for both users is watsonx.data .","title":"SSH Command"},{"location":"wxd-reference-ssh/#jupyter-notebook-terminal","text":"The Jupyter Notebook lab environment provided as part of the lab also provides a way of issuing terminal commands. To access this environment, you must find the Jupyter notebook URL provided in the lab reservation. When you initially open the link, it will request a password to view the Table of Contents: The default password for the notebook is watsonx.data . Once you enter the password, the Table of Contents will be displayed. Select the File menu at the top of the screen and then select New - Terminal. A terminal window will be displayed in the browser. You are now using a terminal session inside the watsonx.data server. By default, you are already the root user, so there is no need to run a sudo su - command. Note that you are not in the correct directory to run commands. You must issue the following command to be in the command directory. cd /root/ibm-lh-dev/bin All the commands in the lab can now be run from within this browser rather than using the VM Remote console. If at any time you accidentally close this window, you can open another one using the Jupyter notebook File menu.","title":"Jupyter Notebook Terminal"},{"location":"wxd-reference-ssh/#terminal-window-in-the-vm","text":"If you use the VM Remote Console , you can log in as the watsonx user and use a Terminal shell to run commands against the watsonx.data server. Select the Terminal application in the virtual machine to issue commands. This will open up the terminal window. At this point you are connected as the watsonx user. You can ignore any lab instructions that ask you to ssh into the watsonx server. To become the root user, you must enter the following command in the terminal window. sudo su - Now as the root user you will be ready to run the commands found in the lab.","title":"Terminal Window in the VM"},{"location":"wxd-reference-ssh/#copying-files","text":"If you need to move files into or out of the virtual machine, you can use the following commands. To copy a file into the virtual machine, use the following syntax: scp -P port myfile.txt watsonx@region.techzone-server.com:/tmp/myfile.txt The filename myfile.txt will be copied to the /tmp directory. The temporary directory is useful since you can copy the file to multiple places from within the Linux environment. Multiple files can be moved by using wildcard characters with the following syntax: scp -P port myfile.* watsonx@region.techzone-server.com:/tmp To move files from the image back to your local system, reverse the file specification. scp -P port watsonx@region.techzone-server.com:/tmp/myfile.txt /Downloads/myfile.txt You can also use wildcards to select more than one file.","title":"Copying Files"},{"location":"wxd-reference-techzone/","text":"Requesting a TechZone image If you are part of a workshop, you do not have to request a reservation. Instead, go to the Accessing a Workshop section. Log into TechZone ( https://techzone.ibm.com ) and search for the watsonx.data Developer Base Image or use the following link.
https://techzone.ibm.com/collection/ibm-watsonxdata-developer-base-image If you have not logged into the IBM Cloud site, you will be asked to authenticate with your IBM userid. If you do not have an IBM userid, you will need to register for one. This lab is open to IBMers and Business Partners. Once you have logged in, you should see the following. Select the Environment tab on the far-left side. Note : There may be more than one environment available. Choose the one best suited for your requirements. Press the Reserve button. Select \"reserve now\" (why wait?). For \"Purpose\" select Self Education. This will expand to request additional information. Fill in the purpose field with something meaningful (watsonx.data education). Next select preferred Geography for the image. Choose any of the regions that are closest to your location. Note : The TechZone scheduler will pick a location in your region that has capacity to deploy your image. Previously you needed to pick a physical location (DAL10, WDC04, TOK02, etc...). The number of locations has expanded to 4 North American, 4 European and 2 AP locations which will hopefully provide more capacity to deploy the lab. If you find that your reservation is not being provisioned, check the status of the TechZone environment by referring to the TechZone status page at https://techzone.status.io . Next select the end date for the lab. Make sure you select enough time for you to use the lab! It defaults to 2 days, but you can extend the reservation! You do not need to enable VPN Access . Once you have completed the form, check the box indicating that you agree to the terms and conditions of using TechZone, and click SUBMIT on the bottom right-hand corner. At this point you will need to wait patiently for an email that acknowledges that your request has been placed into Provisioning mode. Eventually you will receive an email confirming that the system is ready to be used. Note that this can take a number of hours depending on the load on the TechZone servers. You may also get a message telling you that the system provisioning has Failed. Ignore the reason field since it is usually related to an environment failure caused by lack of resources. Check the status of TechZone first ( https://techzone.status.io ). If the systems appear to be okay, try requesting another image or using a different server location if possible. Contact TechZone support if you are having difficulties provisioning a system.","title":"Requesting an image"},{"location":"wxd-reference-techzone/#requesting-a-techzone-image","text":"If you are part of a workshop, you do not have to request a reservation. Instead, go to the Accessing a Workshop section. Log into TechZone ( https://techzone.ibm.com ) and search for the watsonx.data Developer Base Image or use the following link. https://techzone.ibm.com/collection/ibm-watsonxdata-developer-base-image If you have not logged into the IBM Cloud site, you will be asked to authenticate with your IBM userid. If you do not have an IBM userid, you will need to register for one. This lab is open to IBMers and Business Partners. Once you have logged in, you should see the following. Select the Environment tab on the far-left side. Note : There may be more than one environment available. Choose the one best suited for your requirements. Press the Reserve button. Select \"reserve now\" (why wait?). For \"Purpose\" select Self Education. This will expand to request additional information. 
Fill in the purpose field with something meaningful (watsonx.data education). Next select preferred Geography for the image. Choose any of the regions that are closest to your location. Note : The TechZone scheduler will pick a location in your region that has capacity to deploy your image. Previously you needed to pick a physical location (DAL10, WDC04, TOK02, etc...). The number of locations has expanded to 4 North American, 4 European and 2 AP locations which will hopefully provide more capacity to deploy the lab. If you find that your reservation is not being provisioned, check the status of the TechZone environment by referring to the TechZone status page at https://techzone.status.io . Next select the end date for the lab. Make sure you select enough time for you to use the lab! It defaults to 2 days, but you can extend the reservation! You do not need to enable VPN Access . Once you have completed the form, check the box indicating that you agree to the terms and conditions of using TechZone, and click SUBMIT on the bottom right-hand corner. At this point you will need to wait patiently for an email that acknowledges that your request has been placed into Provisioning mode. Eventually you will receive an email confirming that the system is ready to be used. Note that this can take a number of hours depending on the load on the TechZone servers. You may also get a message telling you that the system provisioning has Failed. Ignore the reason field since it is usually related to an environment failure caused by lack of resources. Check the status of TechZone first ( https://techzone.status.io ). If the systems appear to be okay, try requesting another image or using a different server location if possible. Contact TechZone support if you are having difficulties provisioning a system.","title":"Requesting a TechZone image"},{"location":"wxd-reference-vnc/","text":"Using the VM Remote Console The watsonx server that has been provisioned has no physical monitor attached to it (commonly referred to as headless), so we need to use a different technique to view the desktop of the main user of the system (watsonx). The first thing to consider is whether you need to use the VM Remote Console at all. The services, such as the watsonx.data UI, MinIO, Presto, Apache Superset, and Portainer, are all web-based, which means you just need to use your own browser to access these programs. Connecting into the watsonx virtual machine can be done using the secure shell command (ssh), which provides access to all the low-level commands you might need, such as starting the Apache Superset service. Note that Apache Superset is not up and running by default, so you will need to start it before attempting to connect to it. So what's the VM Remote Console required for? One program that has been provided to view the database schemas is dBeaver, a community edition of software that provides a query interface to hundreds of data sources, including the watsonx.data environment. You can only use this program through the VM Remote Console. You do have the option of installing this software on your own machine if you wish. Find your email message that contains details of your reservation. Details of the reservation, and the page on which they appear, can be found in the Accessing the reservation section. Once the details appear, scroll down to the bottom of the web page, and you will see the VM Remote Console button.
You can access the logon screen of the virtual machine by pressing the VM Remote Console button. Clicking on this button will display the logon screen for the server. Select the watsonx user and use watsonx.data as the password. You can open this window in a separate browser window, or place it into fullscreen mode. Note that you may need to increase the size of your browser window (or change the scaling in the browser) to see all of the virtual desktop. At this point you have access to the desktop of the watsonx user and can issue commands from within this environment. As mentioned previously, you do not need to use this interface to use the lab. Enabling VNC Access The watsonx.data image has port 5901 exposed for use with VNC browsers. If you want to use VNC instead of the VM Remote Console access, you must do the following: Make sure you are not currently logged in using the VM Remote Console You have suitable VNC software (Mac OSX includes this). Use RealVNC or UltraVNC on a Windows box. You have a Terminal Shell open to issue root commands In a terminal window, ssh into the watsonx.data virtual machine as the watsonx user. Then you will need to become the root user and issue the following commands: sudo su - systemctl enable vncserver@:1 systemctl start vncserver@:1 systemctl daemon-reload After these commands complete, you will not be able to use the VM Remote Console to connect to the watsonx userid. Instead, you will need to use your VNC software to connect to the server. If at any time you want to turn off VNC support, issue the following commands: sudo su - systemctl disable vncserver@:1 systemctl stop vncserver@:1 systemctl daemon-reload Access watsonx.data on a Mac OSX system Once the VNC service has been started, you can connect to the machine by using the VNC URL provided in your reservation document (sample URL below): VNC Service - vnc://region.techzone-server.com:28314 Use the Mac screen sharing app to connect to watsonx.data. You can connect using the OSX Safari browser by using the URL provided above. It will automatically start the screen sharing application. Note : The VNC URL format is only valid in Safari and may not work in other browsers. When the service connects to the server, it will prompt for the password of the watsonx user - watsonx.data . Once connected, you will see the console of the watsonx user. You may also want to consider making the screen size larger. Use the drop-down menu (Applications) at the top of the screen to select Other -> Settings. In the Devices section of the Settings menu, select Displays and choose a resolution that is suitable for your environment. Access watsonx.data on a Windows system Windows does not supply a native VNC browser. You will need to install a program like RealVNC or UltraVNC to access the console. Directions for installing UltraVNC are shown below. Note : This software has not been officially approved for use on Windows, although it has been tested against the watsonx.data server. UltraVNC UltraVNC is a VNC browser for the Windows environment. This is an open-source offering that can be found on Github in the UltraVNC project . The developers of this code have additional offerings that they sell for a fee and use advertising on their site to support their work on this and other projects. Because of the way the website has been designed, there are a number of ads and buttons that might distract you from the actual product you want to download. The official web page of UltraVNC is https://uvnc.com/ .
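Before installing a VNC viewer, you can optionally confirm that the VNC port is reachable from your workstation (a quick sketch using netcat, assuming nc is available on your system; substitute the server name and VNC port from your reservation): nc -vz region.techzone-server.com port If the port is not reachable, verify that you enabled and started the vncserver@:1 service as described above.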
Instead of going through all the menus, use the following link to go directly to the download screen: UltraVNC Download Link The website will display an initial download page that will wait for 10 seconds before you can do anything. This gives you time to read the advertising or make a voluntary donation to their coding efforts. Once the wait time has expired, you will see the following screen: Make sure to check that you accept the above conditions and press the download button. You should see the download progress in your browser. Click \"Open File\" against the downloaded file. Once the installer starts, it will ask you to approve changes to the system. Press Yes and then select the language you want. Press OK. Select Accept the license agreement and press Next. The summary of what it is going to install is shown on this screen. You will then have to provide the installation location or use the default location for the code. The next panel asks which components you want to install. Only install the viewer. Then it requests the name of the Shortcut folder. Just keep the name it recommends. Press Next. The next panel asks if you want a desktop shortcut. Probably a good idea if you are going to use it for a long period of time. Notice how they make this install take longer than expected to get you to donate? After all of that you will see the final installation screen. Click on Install and eventually you will get the completion notice. Pressing Finish will give you one final chance to donate. Make sure to unselect \"Show latest releases\" or else you will be directed back to their website. Using UltraVNC Start UltraVNC viewer by scrolling through your applications on your desktop. Choose the UltraVNC Viewer (not the listen mode ones unless you want to watch someone else using the desktop). When the service starts, it will ask for the server and port (Example below). VNC Service - vnc://region.techzone-server.com:28314 For the server you would enter region.techzone-server.com and the port would be 28314 . The examples below assume the IP address of 192.168.252.2 with a port number of 5901 . Before hitting enter, you may want to select the options button and change the setting in Miscellaneous to prevent advertising from being displayed while working with this program. Press connect when done. The password for the service is watsonx. which is watsonx with a period . at the end. At this point you should see the desktop of the virtual machine. You are now connected and can work on the watsonx.data desktop. If you find that performance is sluggish, this may be due to network latency. In the settings toolbar of the UltraVNC window, change the color resolution from Full to 256 . Your screen may look a bit washed out, but this will reduce the amount of data that needs to be sent over the network to render your screen.","title":"Using the VM Remote Console"},{"location":"wxd-reference-vnc/#using-the-vm-remote-console","text":"The watsonx server that has been provisioned has no physical monitor attached to it (commonly referred to as headless), so we need to use a different technique to view the desktop of the main user of the system (watsonx). The first thing to consider is whether you need to use the VM Remote Console at all. The services, such as the watsonx.data UI, MinIO, Presto, Apache Superset, and Portainer, are all web-based, which means you just need to use your own browser to access these programs.
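As a quick way to confirm that one of these web-based services is responding before opening it in your browser, you can issue a curl request against it (a sketch only; the -k flag skips certificate validation because the image uses a self-signed certificate, and the host and port must come from your reservation): curl -k -I https://region.techzone-server.com:port A 200 or 302 response code indicates that the service is up.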
Connecting into the watsonx virtual machine can be done using the secure shell command (ssh), which provides access to all the low-level commands you might need, such as starting the Apache Superset service. Note that Apache Superset is not up and running by default, so you will need to start it before attempting to connect to it. So what's the VM Remote Console required for? One program that has been provided to view the database schemas is dBeaver, a community edition of software that provides a query interface to hundreds of data sources, including the watsonx.data environment. You can only use this program through the VM Remote Console. You do have the option of installing this software on your own machine if you wish. Find your email message that contains details of your reservation. Details of the reservation, and the page on which they appear, can be found in the Accessing the reservation section. Once the details appear, scroll down to the bottom of the web page, and you will see the VM Remote Console button. You can access the logon screen of the virtual machine by pressing the VM Remote Console button. Clicking on this button will display the logon screen for the server. Select the watsonx user and use watsonx.data as the password. You can open this window in a separate browser window, or place it into fullscreen mode. Note that you may need to increase the size of your browser window (or change the scaling in the browser) to see all of the virtual desktop. At this point you have access to the desktop of the watsonx user and can issue commands from within this environment. As mentioned previously, you do not need to use this interface to use the lab.","title":"Using the VM Remote Console"},{"location":"wxd-reference-vnc/#enabling-vnc-access","text":"The watsonx.data image has port 5901 exposed for use with VNC browsers. If you want to use VNC instead of the VM Remote Console access, you must do the following: Make sure you are not currently logged in using the VM Remote Console You have suitable VNC software (Mac OSX includes this). Use RealVNC or UltraVNC on a Windows box. You have a Terminal Shell open to issue root commands In a terminal window, ssh into the watsonx.data virtual machine as the watsonx user. Then you will need to become the root user and issue the following commands: sudo su - systemctl enable vncserver@:1 systemctl start vncserver@:1 systemctl daemon-reload After these commands complete, you will not be able to use the VM Remote Console to connect to the watsonx userid. Instead, you will need to use your VNC software to connect to the server. If at any time you want to turn off VNC support, issue the following commands: sudo su - systemctl disable vncserver@:1 systemctl stop vncserver@:1 systemctl daemon-reload","title":"Enabling VNC Access"},{"location":"wxd-reference-vnc/#access-watsonxdata-on-a-mac-osx-system","text":"Once the VNC service has been started, you can connect to the machine by using the VNC URL provided in your reservation document (sample URL below): VNC Service - vnc://region.techzone-server.com:28314 Use the Mac screen sharing app to connect to watsonx.data. You can connect using the OSX Safari browser by using the URL provided above. It will automatically start the screen sharing application. Note : The VNC URL format is only valid in Safari and may not work in other browsers. When the service connects to the server, it will prompt for the password of the watsonx user - watsonx.data .
Once connected, you will see the console of the watsonx user. You may also want to consider making the screen size larger. Use the drop-down menu (Applications) at the top of the screen to select Other -> Settings. In the Devices section of the Settings menu, select Displays and choose a resolution that is suitable for your environment.","title":"Access watsonx.data on a Mac OSX system"},{"location":"wxd-reference-vnc/#access-watsonxdata-on-a-windows-system","text":"Windows does not supply a native VNC browser. You will need to install a program like RealVNC or UltraVNC to access the console. Directions for installing UltraVNC are shown below. Note : This software has not been officially approved for use on Windows, although it has been tested against the watsonx.data server.","title":"Access watsonx.data on a Windows system"},{"location":"wxd-reference-vnc/#ultravnc","text":"UltraVNC is a VNC browser for the Windows environment. This is an open-source offering that can be found on Github in the UltraVNC project . The developers of this code have additional offerings that they sell for a fee and use advertising on their site to support their work on this and other projects. Because of the way the website has been designed, there are a number of ads and buttons that might distract you from the actual product you want to download. The official web page of UltraVNC is https://uvnc.com/ .
The examples below assume the IP address of 192.168.252.2 with a port number of 5901 . Before hitting enter, you may want to select the options button and change the setting in Miscellaneous to prevent advertising from being displayed while working with this program. Press connect when done. The password for the service is watsonx. which is watsonx with a period . at the end. At this point you should see the desktop of the virtual machine. You are now connected and can work on the watsonx.data desktop. If you find that performance is sluggish, this may be due to network latency. In the settings toolbar of the UltraVNC window, change the color resolution from Full to 256 . Your screen may look a bit washed out, but this will reduce the amount of data that needs to be sent over the network to render your screen.","title":"Using UltraVNC"},{"location":"wxd-reference-workshop/","text":"Accessing a Workshop To access a watsonx.data workshop, you will need to have an IBM userid and a link provided by your class instructor. This link will first ask you to log into the system using your IBM userid, and then a screen similar to the following will be displayed: The title of the workshop may be different, but the steps to access the lab will remain the same. The class instructor will have provided a unique password for the course. Enter that value into the password/access code box and click on the Submit button. When the connection is successful, the details of your environment will be shown in the browser. The top of the page contains all the published services that you will use during the lab. For instance, if the lab requires that you access the Presto console, you would click on the link in this page which says: Presto console - https://useast.techzone-services.com:xxxxx At the bottom of the reservation you will find a button that provides access to the machine console. The large blue button labelled VM Remote Console will provide access to the Linux operating system that the watsonx.data server is running on. See the section on VM Remote Console","title":"Accessing a workshop"},{"location":"wxd-reference-workshop/#accessing-a-workshop","text":"To access a watsonx.data workshop, you will need to have an IBM userid and a link provided by your class instructor. This link will first ask you to log into the system using your IBM userid, and then a screen similar to the following will be displayed: The title of the workshop may be different, but the steps to access the lab will remain the same. The class instructor will have provided a unique password for the course. Enter that value into the password/access code box and click on the Submit button. When the connection is successful, the details of your environment will be shown in the browser. The top of the page contains all the published services that you will use during the lab. For instance, if the lab requires that you access the Presto console, you would click on the link in this page which says: Presto console - https://useast.techzone-services.com:xxxxx At the bottom of the reservation you will find a button that provides access to the machine console. The large blue button labelled VM Remote Console will provide access to the Linux operating system that the watsonx.data server is running on.
See the section on VM Remote Console","title":"Accessing a Workshop"},{"location":"wxd-revisions/","text":"Revisions February 29, 2024 (1.1.2) SSL connection for data sources You can now enable SSL connection for the following data sources by using the Add database user interface to secure and encrypt the database connection: Db2 PostgreSQL IBM Data Virtualization Manager for z/OS For IBM Data Virtualization Manager for z/OS and PostgreSQL, select Validate certificate to validate whether the SSL certificate that is returned by the host is trusted. For the IBM Data Virtualization Manager for z/OS data source, you can choose to provide the hostname in the SSL certificate. Secure ingestion job history Now users can view only their own ingestion job history. Administrators can view the ingestion job history for all users. New data types BLOB and CLOB for SAPHANA and Teradata data sources New data types BLOB and CLOB are available for SAPHANA and Teradata data sources. You can use these data types only with SELECT statements in the Query workspace to build and run queries against your data. Use more SQL statements You can now use the following SQL statements in the Query workspace to build and run queries against your data: Apache Iceberg data sources: CREATE VIEW DROP VIEW MongoDB data sources: DELETE Create a new table during data ingestion Previously, you had to have a target table in watsonx.data for ingesting data. Now, you can create a new table directly from the source data file (available in parquet or CSV format) by using data ingestion through the watsonx.data user interface. You can create the table by using the following methods of ingestion: Ingesting data by using Iceberg copy loader Ingesting data by using Spark Perform ALTER TABLE operations on a column With an Iceberg data source, you can now perform ALTER TABLE operations on a column for the following data type conversions: int to bigint float to double decimal (num1, dec_digits) to decimal (num2, dec_digits), where num2>num1. Better query performance by using sorted files With an Iceberg data source, you can generate sorted files, which reduce the query result latency and improve the performance of Presto. Data in the Apache Iceberg table is sorted during the writing process within each file. You can configure the order to sort the data by using the sorted_by table property. When you create the table, specify the array of columns involved in sorting. Exposing Hive metastore port details (Developer edition) You can now expose the Hive metastore port details outside the watsonx.data developer edition's host to facilitate connection from external applications (services outside of docker or Podman), such as the integration of Db2 and Spark with watsonx.data. January 25, 2024 (1.1.1) Updated Lab Documentation Instructions for using a Workshop environment New section on user administration and creating policies Running terminal commands now uses the Jupyter notebook shell January 8, 2024 (1.1.1) Updated the lab to GA watsonx.data 1.1.1 code What's new in watsonx.data version 1.1.1 Reference Audit logging IBM watsonx.data now integrates with the Cloud Pak for Data audit logging service. Auditable events for watsonx.data are forwarded to the security information and event management (SIEM) solution that you integrate with.
Use self-signed certificates and CA certificates to connect to object stores Previously, watsonx.data could connect to HTTPS endpoints that used certificates signed by well-known certificate authorities, such as IBM Cloud\u00ae Object Storage and Amazon S3. Now, you can connect to object stores that use self-signed certificates or certificates that are signed by other certificate authorities. Integration with Db2\u00ae and Netezza\u00ae You can now register Db2 or Netezza engines with a valid console URL. You can use the metastore URL shown in the Engine detail page to sync the respective engines with the appropriate bucket catalog-based table. IBM Data Virtualization Manager for z/OS\u00ae connector You can use the new IBM Data Virtualization Manager for z/OS\u00ae connector to read and write IBM Z\u00ae data without having to move, replicate, or transform it. For more information, see Connecting to an IBM Data Virtualization Manager (DVM) data source. Better memory management Metastore caching and metadata caching (header and footer caching) are now enabled by default to optimize the memory usage. Also, now you can create a local staging directory to optimize the use of resources during data operations. For more information, see Enhancing the query performance through caching and Configuring a local staging directory. Presto case-sensitive behavior The Presto behavior is changed from case-insensitive to case-sensitive. Now you can provide object names in their original case format, as in the database. You can also create Schemas, Tables and Columns in mixed case, that is, uppercase and lowercase, through Presto if the database supports it. Teradata connector is enabled for multiple ALTER TABLE statements Teradata connector now supports the ALTER TABLE RENAME TO, ALTER TABLE DROP COLUMN, ALTER TABLE RENAME COLUMN column_name TO new_column_name statements. Removal of development (*-devel) packages For security reasons, the *-devel packages are removed from watsonx.data. If you are already using the development packages, the programs that use the development packages cannot be compiled . For any queries, contact IBM Support. SSL is enabled for PostgreSQL Now ingestion can use mounted certificates when connecting to PostgreSQL. January 3, 2024 (1.1.0) Added two open ports to the image Sometimes there is a requirement to add another service to the watsonx.data image. For instance, you may want to add MongoDB or MSSQL to the system in order to demonstrate federation with these data sources. Since we do not know what your requirements are, we have opened up two ports that can be assigned to any service. The documentation has been updated to describe what steps are needed to open up and use these ports. December 6, 2023 (1.1.0) Updated the lab to GA 1.1.0 code What's new in watsonx.data version 1.1.0 Reference Time-travel and roll-back queries You can now run the following time-travel queries to access historical data in Apache Iceberg tables: SELECT FROM FOR TIMESTAMP AS OF TIMESTAMP SELECT FROM FOR VERSION AS OF You can use time-travel queries to query and restore data that was updated or deleted in the past. You can also roll back an Apache Iceberg table to any existing snapshot. Capture historical data about Presto queries The Query History Monitoring and Management (QHMM) service captures historical data about Presto queries and events. The historical data is stored in a MinIO bucket and you can use the data to understand the queries that were run and to debug the Presto engine.
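To make the time-travel syntax above concrete, here is a sketch of what such a query could look like from the Presto CLI (the catalog, schema, table name, and timestamp are hypothetical and must be replaced with your own): ./presto-cli --catalog iceberg_data --schema workshop --execute \"SELECT * FROM customer FOR TIMESTAMP AS OF TIMESTAMP '2024-01-01 00:00:00' LIMIT 5;\" A query with FOR VERSION AS OF takes a snapshot ID instead of a timestamp.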
Improved query performance with Metastore, File list, and File metadata caching. You can now capture and track the DDL changes in watsonx.data by using an event listener. Ingest data by using Spark You can now use the IBM Analytics Engine powered by Apache Spark to run ingestion jobs in watsonx.data. Integration with Db2 and Netezza Performance Server You can now register Db2 or Netezza Performance Server engines in the watsonx.data console. New connectors You can now use connectors in watsonx.data to establish connections to the following types of databases: Teradata Delta Lake Elasticsearch SAP HANA SingleStoreDB Snowflake Db2 Upgraded to 11.5.9 What's new in Db2 11.5.9 Reference October 6, 2023 (1.0.3) Updated the lab to GA 1.0.3 code What's new in watsonx.data version 1.1.0 Image now available in 10 data centers with simpler provisioning Requesting an Image Removed VPN Requirement External URLs and Ports for all UI Services watsonx.data Ports Added PostgreSQL and MySQL databases Postgres Connection MySQL Connection Added Jupyter notebook examples Jupyter Notebook support Fixed Presto certificate to support TechZone addresses without updating /etc/hosts Watsonx.data Connection Certificate Added standalone Spark server to show connectivity to the Presto database Accessing watsonx.data with Spark Added watsonx.data Client code Watsonx.data client utilities Added MinIO CLI interface MinIO CLI Exposed external ports for MinIO, Db2, MySQL, PostgreSQL, Hive, PrestoDB watsonx.data Ports VNC Interface disabled by default Enabling VNC Access Added Ingesting data chapter Ingesting Data July 25, 2023 (1.0.1) Updated the lab to GA 1.0.1 code Automated start of watsonx.data and simplification of many of the sections Removed the Ingest section until a new version is available Added Db2 and PostgreSQL connection details June 12, 2023 (1.0.0) Clarified some commands and added an Appendix on common issues. June 6, 2023 (1.0.0) Updated instructions for new TechZone image and added Ingest lab instructions. May 25th, 2023 (1.0.0) Initial publication.","title":"What's New"},{"location":"wxd-revisions/#revisions","text":"","title":"Revisions"},{"location":"wxd-revisions/#february-29-2024-112","text":"SSL connection for data sources You can now enable SSL connection for the following data sources by using the Add database user interface to secure and encrypt the database connection: Db2 PostgreSQL IBM Data Virtualization Manager for z/OS For IBM Data Virtualization Manager for z/OS and PostgreSQL, select Validate certificate to validate whether the SSL certificate that is returned by the host is trusted. For the IBM Data Virtualization Manager for z/OS data source, you can choose to provide the hostname in the SSL certificate. Secure ingestion job history Now users can view only their own ingestion job history. Administrators can view the ingestion job history for all users. New data types BLOB and CLOB for SAPHANA and Teradata data sources New data types BLOB and CLOB are available for SAPHANA and Teradata data sources. You can use these data types only with SELECT statements in the Query workspace to build and run queries against your data. Use more SQL statements You can now use the following SQL statements in the Query workspace to build and run queries against your data: Apache Iceberg data sources: CREATE VIEW DROP VIEW MongoDB data sources: DELETE Create a new table during data ingestion Previously, you had to have a target table in watsonx.data for ingesting data.
Now, you can create a new table directly from the source data file (available in parquet or CSV format) by using data ingestion through the watsonx.data user interface. You can create the table by using the following methods of ingestion: Ingesting data by using Iceberg copy loader Ingesting data by using Spark Perform ALTER TABLE operations on a column With an Iceberg data source, you can now perform ALTER TABLE operations on a column for the following data type conversions: int to bigint float to double decimal (num1, dec_digits) to decimal (num2, dec_digits), where num2>num1. Better query performance by using sorted files With an Iceberg data source, you can generate sorted files, which reduce the query result latency and improve the performance of Presto. Data in the Apache Iceberg table is sorted during the writing process within each file. You can configure the order to sort the data by using the sorted_by table property. When you create the table, specify the array of columns involved in sorting. Exposing Hive metastore port details (Developer edition) You can now expose the Hive metastore port details outside the watsonx.data developer edition's host to facilitate connection from external applications (services outside of docker or Podman), such as the integration of Db2 and Spark with watsonx.data.","title":"February 29, 2024 (1.1.2)"},{"location":"wxd-revisions/#january-25-2024-111","text":"Updated Lab Documentation Instructions for using a Workshop environment New section on user administration and creating policies Running terminal commands now uses the Jupyter notebook shell","title":"January 25, 2024 (1.1.1)"},{"location":"wxd-revisions/#january-8-2024-111","text":"Updated the lab to GA watsonx.data 1.1.1 code What's new in watsonx.data version 1.1.1 Reference Audit logging IBM watsonx.data now integrates with the Cloud Pak for Data audit logging service. Auditable events for watsonx.data are forwarded to the security information and event management (SIEM) solution that you integrate with. Use self-signed certificates and CA certificates to connect to object stores Previously, watsonx.data could connect to HTTPS endpoints that used certificates signed by well-known certificate authorities, such as IBM Cloud\u00ae Object Storage and Amazon S3. Now, you can connect to object stores that use self-signed certificates or certificates that are signed by other certificate authorities. Integration with Db2\u00ae and Netezza\u00ae You can now register Db2 or Netezza engines with a valid console URL. You can use the metastore URL shown in the Engine detail page to sync the respective engines with the appropriate bucket catalog-based table. IBM Data Virtualization Manager for z/OS\u00ae connector You can use the new IBM Data Virtualization Manager for z/OS\u00ae connector to read and write IBM Z\u00ae data without having to move, replicate, or transform it. For more information, see Connecting to an IBM Data Virtualization Manager (DVM) data source. Better memory management Metastore caching and metadata caching (header and footer caching) are now enabled by default to optimize the memory usage. Also, now you can create a local staging directory to optimize the use of resources during data operations. For more information, see Enhancing the query performance through caching and Configuring a local staging directory. Presto case-sensitive behavior The Presto behavior is changed from case-insensitive to case-sensitive.
Now you can provide object names in their original case format, as in the database. You can also create Schemas, Tables and Columns in mixed case, that is, uppercase and lowercase, through Presto if the database supports it. Teradata connector is enabled for multiple ALTER TABLE statements Teradata connector now supports the ALTER TABLE RENAME TO, ALTER TABLE DROP COLUMN, ALTER TABLE RENAME COLUMN column_name TO new_column_name statements. Removal of development (*-devel) packages For security reasons, the *-devel packages are removed from watsonx.data. If you are already using the development packages, the programs that use the development packages cannot be compiled . For any queries, contact IBM Support. SSL is enabled for PostgreSQL Now ingestion can use mounted certificates when connecting to PostgreSQL.","title":"January 8, 2024 (1.1.1)"},{"location":"wxd-revisions/#january-3-2024-110","text":"Added two open ports to the image Sometimes there is a requirement to add another service to the watsonx.data image. For instance, you may want to add MongoDB or MSSQL to the system in order to demonstrate federation with these data sources. Since we do not know what your requirements are, we have opened up two ports that can be assigned to any service. The documentation has been updated to describe what steps are needed to open up and use these ports.","title":"January 3, 2024 (1.1.0)"},{"location":"wxd-revisions/#december-6-2023-110","text":"Updated the lab to GA 1.1.0 code What's new in watsonx.data version 1.1.0 Reference Time-travel and roll-back queries You can now run the following time-travel queries to access historical data in Apache Iceberg tables: SELECT FROM FOR TIMESTAMP AS OF TIMESTAMP SELECT FROM FOR VERSION AS OF You can use time-travel queries to query and restore data that was updated or deleted in the past. You can also roll back an Apache Iceberg table to any existing snapshot. Capture historical data about Presto queries The Query History Monitoring and Management (QHMM) service captures historical data about Presto queries and events. The historical data is stored in a MinIO bucket and you can use the data to understand the queries that were run and to debug the Presto engine. Improved query performance with Metastore, File list, and File metadata caching. You can now capture and track the DDL changes in watsonx.data by using an event listener. Ingest data by using Spark You can now use the IBM Analytics Engine powered by Apache Spark to run ingestion jobs in watsonx.data. Integration with Db2 and Netezza Performance Server You can now register Db2 or Netezza Performance Server engines in the watsonx.data console.
New connectors You can now use connectors in watsonx.data to establish connections to the following types of databases: Teradata Delta Lake Elasticsearch SAP HANA SingleStoreDB Snowflake Db2 Upgraded to 11.5.9 What's new in Db2 11.5.9 Reference","title":"December 6, 2023 (1.1.0)"},{"location":"wxd-revisions/#october-6-2023-103","text":"Updated the lab to GA 1.0.3 code What's new in watsonx.data version 1.1.0 Image now available in 10 data centers with simpler provisioning Requesting an Image Removed VPN Requirement External URLs and Ports for all UI Services watsonx.data Ports Added PostgreSQL and MySQL databases Postgres Connection MySQL Connection Added Jupyter notebook examples Jupyter Notebook support Fixed Presto certificate to support TechZone addresses without updating /etc/hosts Watsonx.data Connection Certificate Added standalone Spark server to show connectivity to the Presto database Accessing watsonx.data with Spark Added watsonx.data Client code Watsonx.data client utilities Added MinIO CLI interface MinIO CLI Exposed external ports for MinIO, Db2, MySQL, PostgreSQL, Hive, PrestoDB watsonx.data Ports VNC Interface disabled by default Enabling VNC Access Added Ingesting data chapter Ingesting Data","title":"October 6, 2023 (1.0.3)"},{"location":"wxd-revisions/#july-25-2023-101","text":"Updated the lab to GA 1.0.1 code Automated start of watsonx.data and simplification of many of the sections Removed the Ingest section until a new version is available Added Db2 and PostgreSQL connection details","title":"July 25, 2023 (1.0.1)"},{"location":"wxd-revisions/#june-12-2023-100","text":"Clarified some commands and added an Appendix on common issues.","title":"June 12, 2023 (1.0.0)"},{"location":"wxd-revisions/#june-6-2023-100","text":"Updated instructions for new TechZone image and added Ingest lab instructions.","title":"June 6, 2023 (1.0.0)"},{"location":"wxd-revisions/#may-25th-2023-100","text":"Initial publication.","title":"May 25th, 2023 (1.0.0)"},{"location":"wxd-startwatsonx/","text":"Lab Instructions URL Conventions Your TechZone reservation contains a number of URLs for the services provided in the watsonx.data server. The URL will contain the name of the server and the corresponding port number for the service. Throughout the documentation, the server name will be referred to as region.techzone-server.com and the port number will be referred to as port . Where you see these URLs, replace them with the values found in your reservation. Commands Throughout the labs, any command that needs to be executed will be highlighted in a grey box: cd /root/ibm-lh-dev/bin A copy icon is usually found on the far right-hand side of the command box. Use this to copy the text and paste it into your command window. You can also select the text and copy it that way. Note that some commands may span multiple lines, so make sure you copy everything in the box. System Check Your TechZone reservation will include the server name and port number to use when connecting using ssh. The port number is referred to as port in the command below, while the server will be referred to as region.techzone-server.com . Replace these values with those found in your reservation. Make sure that you have an open terminal session and use the following command to connect to the watsonx.data server. ssh -p port watsonx@region.techzone-server.com The password is watsonx.data . Next, switch to the root userid. sudo su - Switch to the development code bin directory.
cd /root/ibm-lh-dev/bin Once you have switched to the development directory, you can start running watsonx.data commands. You can check the status with the following command. ./status.sh --all Output will look similar to: using /root/ibm-lh-dev/localstorage/volumes as data root directory for user: root/1001 infra config location is /root/ibm-lh-dev/localstorage/volumes/infra lhconsole-ui running 0.0.0.0:9443->8443/tcp, :::9443->8443/tcp lhconsole-nodeclient-svc running 3001/tcp lhconsole-javaapi-svc running 8090/tcp lhconsole-api running 3333/tcp, 8081/tcp ibm-lh-presto running 0.0.0.0:8443->8443/tcp, :::8443->8443/tcp ibm-lh-hive-metastore running ibm-lh-postgres running 5432/tcp ibm-lh-minio running To confirm that the software is working, run the following commands to validate the installation. Presto Engine Test Check the Presto engine by connecting to a schema. First, we need to make sure that the Presto engine has completed all startup tasks. The following command is not part of watsonx.data, but has been included to simplify checking the status of the Presto service. check_presto Waiting for Presto to start. ........................... Ready Note : Startup may take up to 5 minutes when the system first initializes. Once the command returns \"Ready\", you can connect to the Presto CLI. ./presto-cli --catalog tpch --schema tiny Retrieve a sample of rows from the customer table. Note : If the Presto engine has not yet started (you didn't run the check_presto script), the next command may result in an unhelpful Java error message. You may need to wait a minute before attempting to run the statement again. select * from customer limit 10; All Presto commands end with a semi-colon. The result set should include a number of rows (the exact rows returned may vary). custkey | name | address | nationkey | phone | acctbal | mktsegment | comment ---------+--------------------+---------------------------------------+-----------+-----------------+---------+------------+------------------------------------------------------------------------------------------------------------------- 1 | Customer#000000001 | IVhzIApeRb ot,c,E | 15 | 25-989-741-2988 | 711.56 | BUILDING | to the even, regular platelets. regular, ironic epitaphs nag e 2 | Customer#000000002 | XSTf4,NCwDVaWNe6tEgvwfmRchLXak | 13 | 23-768-687-3665 | 121.65 | AUTOMOBILE | l accounts. blithely ironic theodolites integrate boldly: caref 3 | Customer#000000003 | MG9kdTD2WBHm | 1 | 11-719-748-3364 | 7498.12 | AUTOMOBILE | deposits eat slyly ironic, even instructions. express foxes detect slyly. blithely even accounts abov 4 | Customer#000000004 | XxVSJsLAGtn | 4 | 14-128-190-5944 | 2866.83 | MACHINERY | requests. final, regular ideas sleep final accou 5 | Customer#000000005 | KvpyuHCplrB84WgAiGV6sYpZq7Tj | 3 | 13-750-942-6364 | 794.47 | HOUSEHOLD | n accounts will have to unwind. foxes cajole accor 6 | Customer#000000006 | sKZz0CsnMD7mp4Xd0YrBvx,LREYKUWAh yVn | 20 | 30-114-968-4951 | 7638.57 | AUTOMOBILE | tions. even deposits boost according to the slyly bold packages. final accounts cajole requests. furious 7 | Customer#000000007 | TcGe5gaZNgVePxU5kRrvXBfkasDTea | 18 | 28-190-982-9759 | 9561.95 | AUTOMOBILE | ainst the ironic, express theodolites. express, even pinto beans among the exp 8 | Customer#000000008 | I0B10bB0AymmC, 0PrRYBCP1yGJ8xcBPmWhl5 | 17 | 27-147-574-9335 | 6819.74 | BUILDING | among the slyly regular theodolites kindle blithely courts.
carefully even theodolites haggle slyly along the ide 9 | Customer#000000009 | xKiAFTjUsCuxfeleNqefumTrjS | 8 | 18-338-906-3675 | 8324.07 | FURNITURE | r theodolites according to the requests wake thinly excuses: pending requests haggle furiousl 10 | Customer#000000010 | 6LrEaV6KR6PLVcgl2ArL Q3rqzLzcT1 v2 | 5 | 15-741-346-9870 | 2753.54 | HOUSEHOLD | es regular deposits haggle. fur (10 rows) The output on your screen will look similar to the following: The arrows on the far right side indicate that there is more output to view. Press the right and left arrows on your keyboard to scroll the display. If the result set is small, all of the results will display on the screen and no scrolling will be available unless the results are wider than the screen size. When the display shows (END), you have reached the bottom of the output. If the display shows a colon ( : ) at the bottom of the screen, you can use the up and down arrow keys to scroll a record at a time, or the Page Up and Page Down keys to scroll a page at a time. To quit viewing the output, press the Q key. Quit the Presto CLI. The Presto quit command can be used with or without a semicolon. quit; Congratulations, your system is now up and running!","title":"Lab Instructions"},{"location":"wxd-startwatsonx/#lab-instructions","text":"","title":"Lab Instructions"},{"location":"wxd-startwatsonx/#url-conventions","text":"Your TechZone reservation contains a number of URLs for the services provided in the watsonx.data server. The URL will contain the name of the server and the corresponding port number for the service. Throughout the documentation, the server name will be referred to as region.techzone-server.com and the port number will be referred to as port . Where you see these URLs, replace them with the values found in your reservation.","title":"URL Conventions"},{"location":"wxd-startwatsonx/#commands","text":"Throughout the labs, any command that needs to be executed will be highlighted in a grey box: cd /root/ibm-lh-dev/bin A copy icon is usually found on the far right-hand side of the command box. Use this to copy the text and paste it into your command window. You can also select the text and copy it that way. Note that some commands may span multiple lines, so make sure you copy everything in the box.","title":"Commands"},{"location":"wxd-startwatsonx/#system-check","text":"Your TechZone reservation will include the server name and port number to use when connecting using ssh. The port number is referred to as port in the command below, while the server will be referred to as region.techzone-server.com . Replace these values with those found in your reservation. Make sure that you have an open terminal session and use the following command to connect to the watsonx.data server. ssh -p port watsonx@region.techzone-server.com The password is watsonx.data . Next, switch to the root userid. sudo su - Switch to the development code bin directory. cd /root/ibm-lh-dev/bin Once you have switched to the development directory, you can start running watsonx.data commands. You can check the status with the following command.
./status.sh --all Output will look similar to: using /root/ibm-lh-dev/localstorage/volumes as data root directory for user: root/1001 infra config location is /root/ibm-lh-dev/localstorage/volumes/infra lhconsole-ui running 0.0.0.0:9443->8443/tcp, :::9443->8443/tcp lhconsole-nodeclient-svc running 3001/tcp lhconsole-javaapi-svc running 8090/tcp lhconsole-api running 3333/tcp, 8081/tcp ibm-lh-presto running 0.0.0.0:8443->8443/tcp, :::8443->8443/tcp ibm-lh-hive-metastore running ibm-lh-postgres running 5432/tcp ibm-lh-minio running To confirm that the software is working, run the following commands to validate the installation.","title":"System Check"},{"location":"wxd-startwatsonx/#presto-engine-test","text":"Check the Presto engine by connecting to a schema. First, we need to make sure that the Presto engine has completed all startup tasks. The following command is not part of watsonx.data, but has been included to simplify checking the status of the Presto service. check_presto Waiting for Presto to start. ........................... Ready Note : Startup may take up to 5 minutes when the system first initializes. Once the command returns \"Ready\", you can connect to the Presto CLI. ./presto-cli --catalog tpch --schema tiny Retrieve a sample of rows from the customer table. Note : If the Presto engine has not yet started (you didn't run the check_presto script), the next command may result in an unhelpful Java error message. You may need to wait a minute before attempting to run the statement again. select * from customer limit 10; All Presto commands end with a semi-colon. The result set should include a number of rows (the exact rows returned may vary). custkey | name | address | nationkey | phone | acctbal | mktsegment | comment ---------+--------------------+---------------------------------------+-----------+-----------------+---------+------------+------------------------------------------------------------------------------------------------------------------- 1 | Customer#000000001 | IVhzIApeRb ot,c,E | 15 | 25-989-741-2988 | 711.56 | BUILDING | to the even, regular platelets. regular, ironic epitaphs nag e 2 | Customer#000000002 | XSTf4,NCwDVaWNe6tEgvwfmRchLXak | 13 | 23-768-687-3665 | 121.65 | AUTOMOBILE | l accounts. blithely ironic theodolites integrate boldly: caref 3 | Customer#000000003 | MG9kdTD2WBHm | 1 | 11-719-748-3364 | 7498.12 | AUTOMOBILE | deposits eat slyly ironic, even instructions. express foxes detect slyly. blithely even accounts abov 4 | Customer#000000004 | XxVSJsLAGtn | 4 | 14-128-190-5944 | 2866.83 | MACHINERY | requests. final, regular ideas sleep final accou 5 | Customer#000000005 | KvpyuHCplrB84WgAiGV6sYpZq7Tj | 3 | 13-750-942-6364 | 794.47 | HOUSEHOLD | n accounts will have to unwind. foxes cajole accor 6 | Customer#000000006 | sKZz0CsnMD7mp4Xd0YrBvx,LREYKUWAh yVn | 20 | 30-114-968-4951 | 7638.57 | AUTOMOBILE | tions. even deposits boost according to the slyly bold packages. final accounts cajole requests. furious 7 | Customer#000000007 | TcGe5gaZNgVePxU5kRrvXBfkasDTea | 18 | 28-190-982-9759 | 9561.95 | AUTOMOBILE | ainst the ironic, express theodolites. express, even pinto beans among the exp 8 | Customer#000000008 | I0B10bB0AymmC, 0PrRYBCP1yGJ8xcBPmWhl5 | 17 | 27-147-574-9335 | 6819.74 | BUILDING | among the slyly regular theodolites kindle blithely courts.
carefully even theodolites haggle slyly along the ide 9 | Customer#000000009 | xKiAFTjUsCuxfeleNqefumTrjS | 8 | 18-338-906-3675 | 8324.07 | FURNITURE | r theodolites according to the requests wake thinly excuses: pending requests haggle furiousl 10 | Customer#000000010 | 6LrEaV6KR6PLVcgl2ArL Q3rqzLzcT1 v2 | 5 | 15-741-346-9870 | 2753.54 | HOUSEHOLD | es regular deposits haggle. fur (10 rows) The output on your screen will look similar to the following: The arrows on the far right side indicate that there is more output to view. Press the right and left arrows on your keyboard to scroll the display. If the result set is small, all of the results will display on the screen and no scrolling will be available unless the results are wider than the screen size. When the display shows (END), you have reached the bottom of the output. If the display shows a colon ( : ) at the bottom of the screen, you can use the up and down arrow keys to scroll a record at a time, or the Page Up and Page Down keys to scroll a page at a time. To quit viewing the output, press the Q key. Quit the Presto CLI. The Presto quit command can be used with or without a semicolon. quit; Congratulations, your system is now up and running!","title":"Presto Engine Test"},{"location":"wxd-superset/","text":"Reporting/Dashboarding using Apache Superset Apache Superset is not a part of watsonx.data and is only used to demonstrate the capability to connect to watsonx.data from other BI/Reporting tools. You will need to install Apache Superset as part of this lab. The Superset repository needs to be in sync with the image being downloaded, so these libraries cannot be preloaded into this development image. Open a terminal window and connect via SSH as the watsonx user. Do not connect as the root user. Clone the Apache Superset repository with the git command. This command typically takes less than 1 minute to download the code. git clone https://github.com/apache/superset.git The docker-compose-non-dev.yml file needs to be updated so that Apache Superset can access the same network that watsonx.data is using. cd ./superset cp docker-compose-non-dev.yml docker-compose-non-dev-backup.yml sed '/version: \"3.7\"/q' docker-compose-non-dev.yml > yamlfix.txt cat << EOF >> yamlfix.txt networks: default: external: True name: ibm-lh-network EOF sed -e '1,/version: \"3.7\"/ d' docker-compose-non-dev.yml >> yamlfix.txt We update the Apache Superset code to version 2.1.0 . sed 's/\\${TAG:-latest-dev}/2.1.0/' yamlfix.txt > docker-compose-non-dev.yml Use docker-compose to start Apache Superset. docker compose -f docker-compose-non-dev.yml up The docker compose command will download the necessary code for Apache Superset and start the service. The terminal session will contain the logging information for the service. The process is running in the foreground, so you will see all the messages being produced by the program. If you want to stop the service at any time you will need to press CTRL-C. If you close this terminal window at any time, the process will stop. When you see \"Init Step 4/4\", the service is ready for connections. If you have already installed Apache Superset and you stopped it, there is no need to reinstall the program. Go back to the /home/watsonx/superset directory and run the docker compose program again: docker compose -f docker-compose-non-dev.yml up Once the service is running, open your browser and navigate to the URL and port that were provided in your TechZone reservation. The credentials for Apache Superset are userid admin , Password admin .
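If you want to verify that the service is up without opening a browser, a simple check from inside the virtual machine is possible (a sketch only; port 8088 is Apache Superset's default inside the image, while the external port in your reservation may differ): curl -I http://localhost:8088/login/ A 200 response code indicates that Superset is ready.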
\u2003 Setup a Database Connection to watsonx.data Open another terminal window for this next step. Once Apache Superset has started loading examples, you can issue the following command as watsonx or root . docker cp /certs/lh-ssl-ts.crt superset_app:/tmp/lh-ssl-ts.crt In the Apache Superset console, press the Settings button on the far right and select Database connections. Then select the [+ DATABASE] option on the far-right side of the panel. \u2003 A connection dialog will display. Select Presto as the database connection type. In the SQLALCHEMY URI field, enter the following information to connect to the hive_data catalog which contains the GOSALES, ONTIME, and TAXI data. presto://ibmlhadmin:password@ibm-lh-presto-svc:8443/hive_data Enter the following information to connect to the iceberg_data catalog which will contain any tables you created when running the examples in the lab. presto://ibmlhadmin:password@ibm-lh-presto-svc:8443/iceberg_data Select the Advanced tab. Copy the following information into the security box. {\"connect_args\":{\"protocol\":\"https\",\"requests_kwargs\":{\"verify\":\"/tmp/lh-ssl-ts.crt\"}}} Press the Connect button to create the connection. Create reports/charts/dashboards Once the connection has been tested and created for watsonx.data, we can click on Dataset and create a new dataset based on the customer table in the tiny schema. Reports/dashboards can then be created using the very intuitive Superset interface. Note : The Apache Superset team removes, inserts and updates charts on a frequent basis with no advance notification. The example you see below may not be exactly the same when you run the code. This is not something that we can control in the demonstration environment. Select Datasets at the top of the Apache Superset window. Press [+ DATASET]. In the Database field, select Presto. The schemas will take a few seconds to load. Select the workshop schema. Select customer from the list. The display will show the columns associated with this table. On the bottom right-hand corner is a button named CREATE DATASET AND CREATE CHART. Press that to display the following panel. To create a simple Bar Chart, we start by selecting the Bar Chart icon. If you click it once it displays information about the chart type. If you double-click it, the chart builder screen will display. Click on the mktsegment field and drag it into the DIMENSIONS field. Then drag the acctbal field into the METRICS field. The program will ask how the field is to be computed. Select AVG from the list and SAVE. Now press the CREATE CHART button found at the bottom of the screen. Try to create different charts/dashboards if you have time. Note : When you are finished using Apache Superset, press CTRL-C (Control-C) in the terminal window that you used to start it. This will stop the program and release the resources it is using. If you press CTRL-C twice, it immediately kills the program, but it may lose some of the work that you may have done.","title":"Apache Superset"},{"location":"wxd-superset/#reportingdashboarding-using-apache-superset","text":"Apache Superset is not a part of watsonx.data and is only used to demonstrate the capability to connect to watsonx.data from other BI/Reporting tools. You will need to install Apache Superset as part of this lab. The Superset repository needs to be in sync with the image being downloaded, so these libraries cannot be preloaded into this development image. Open a terminal window and connect via SSH as the watsonx user. 
Do not connect as the root user. Clone the Apache Superset repository with the git command. This command typically takes less than 1 minute to download the code. git clone https://github.com/apache/superset.git The docker-compose-non-dev.yml file needs to be updated so that Apache Superset can access the same network that watsonx.data is using. cd ./superset cp docker-compose-non-dev.yml docker-compose-non-dev-backup.yml sed '/version: \"3.7\"/q' docker-compose-non-dev.yml > yamlfix.txt cat << EOF >> yamlfix.txt networks: default: external: True name: ibm-lh-network EOF sed -e '1,/version: \"3.7\"/ d' docker-compose-non-dev.yml >> yamlfix.txt We update the Apache Superset code to version 2.1.0 . sed 's/\\${TAG:-latest-dev}/2.1.0/' yamlfix.txt > docker-compose-non-dev.yml Use docker-compose to start Apache Superset. docker compose -f docker-compose-non-dev.yml up The docker compose command will download the necessary code for Apache Superset and start the service. The terminal session will contain the logging information for the service. The process is running in the foreground, so you will see all the messages being produced by the program. If you want to stop the service at any time, you will need to press CTRL-C. If you close this terminal window at any time, the process will stop. When you see \"Init Step 4/4\", the service is ready for connections. If you have already installed Apache Superset and you stopped it, there is no need to reinstall the program. Go back to the /home/watsonx/superset directory and run the docker compose program again. Once the service is running, open your browser and navigate to the URL and port that were provided in your TechZone reservation. The credentials for Apache Superset are userid admin , password admin .","title":"Reporting/Dashboarding using Apache Superset"},{"location":"wxd-superset/#setup-a-database-connection-to-watsonxdata","text":"Open another terminal window for this next step. Once Apache Superset has started loading examples, you can issue the following command as watsonx or root . docker cp /certs/lh-ssl-ts.crt superset_app:/tmp/lh-ssl-ts.crt In the Apache Superset console, press the Settings button on the far right and select Database connections. Then select the [+ DATABASE] option on the far-right side of the panel. \u2003 A connection dialog will display. Select Presto as the database connection type. In the SQLALCHEMY URI field, enter the following information to connect to the hive_data catalog which contains the GOSALES, ONTIME, and TAXI data. presto://ibmlhadmin:password@ibm-lh-presto-svc:8443/hive_data Enter the following information to connect to the iceberg_data catalog which will contain any tables you created when running the examples in the lab. presto://ibmlhadmin:password@ibm-lh-presto-svc:8443/iceberg_data Select the Advanced tab. Copy the following information into the security box. {\"connect_args\":{\"protocol\":\"https\",\"requests_kwargs\":{\"verify\":\"/tmp/lh-ssl-ts.crt\"}}} Press the Connect button to create the connection.","title":"Setup a Database Connection to watsonx.data"},{"location":"wxd-superset/#create-reportschartsdashboards","text":"Once the connection has been tested and created for watsonx.data, we can click on Dataset and create a new dataset based on the customer table in the tiny schema. Reports/dashboards can then be created using the very intuitive Superset interface. Note : The Apache Superset team adds, removes, and updates charts on a frequent basis with no advance notification.
The example you see below may not be exactly the same when you run the code. This is not something that we can control in the demonstration environment. Select Datasets at the top of the Apache Superset window. Press [+ DATASET]. In the Database field, select Presto. The schemas will take a few seconds to load. Select the workshop schema. Select customer from the list. The display will show the columns associated with this table. In the bottom right-hand corner is a button named CREATE DATASET AND CREATE CHART. Press that to display the following panel. To create a simple Bar Chart, we start by selecting the Bar Chart icon. If you click it once, it displays information about the chart type. If you double-click it, the chart builder screen will display. Click on the mktsegment field and drag it into the DIMENSIONS field. Then drag the acctbal field into the METRICS field. The program will ask how the field is to be computed. Select AVG from the list and SAVE. Now press the CREATE CHART button found at the bottom of the screen. Try to create different charts/dashboards if you have time. Note : When you are finished using Apache Superset, press CTRL-C (Control-C) in the terminal window that you used to start it. This will stop the program and release the resources it is using. If you press CTRL-C twice, it immediately kills the program, but you may lose some of the work that you have done.","title":"Create reports/charts/dashboards"},{"location":"wxd-systemconnector/","text":"Using Presto System Connector The Presto System connector provides information and metrics about the currently running Presto cluster. You can use this function to monitor the workloads on the Presto cluster using normal SQL queries. Make sure you are the root user and in the proper development directory. cd /root/ibm-lh-dev/bin Start the Presto CLI. ./presto-cli What queries are currently running? select * from \"system\".runtime.queries limit 5; query_id | state | user | source | query | resource_group_id | queued_time_ms | analysis_time_ms | created | started | last_heartbeat | end -----------------------------+----------+------------+------------------+-------------------------------------------------------------+-------------------+----------------+------------------+-------------------------+-------------------------+-------------------------+------------------------- 20230626_182942_00007_4suid | FINISHED | ibmlhadmin | presto-cli | show tables | [global] | 0 | 33 | 2023-06-26 18:29:40.628 | 2023-06-26 18:29:40.817 | 2023-06-26 18:29:41.095 | 2023-06-26 18:29:41.118 20230626_182938_00005_4suid | FINISHED | ibmlhadmin | presto-cli | SHOW FUNCTIONS | [global] | 1 | 607 | 2023-06-26 18:29:36.718 | 2023-06-26 18:29:36.777 | 2023-06-26 18:29:37.707 | 2023-06-26 18:29:37.742 20230626_192655_00031_4suid | FINISHED | ibmlhadmin | presto-cli | show schemas | [global] | 1 | 257 | 2023-06-26 19:26:53.739 | 2023-06-26 19:26:54.043 | 2023-06-26 19:26:54.845 | 2023-06-26 19:26:54.866 20230626_183851_00018_4suid | FINISHED | ibmlhadmin | nodejs-client | select * from system.runtime.queries order by query_id desc | [global] | 1 | 27 | 2023-06-26 18:38:49.169 | 2023-06-26 18:38:49.293 | 2023-06-26 18:38:50.084 | 2023-06-26 18:38:50.121 20230626_185405_00021_4suid | FINISHED | ibmlhadmin | presto-go-client | SHOW TABLES | [global] | 0 | 56 | 2023-06-26 18:54:03.542 | 2023-06-26 18:54:03.729 | 2023-06-26 18:54:04.042 | 2023-06-26 18:54:04.041 (5 rows) What tasks make up a query and where is the task running?
select * from \"system\".runtime.tasks limit 5; node_id | task_id | stage_execution_id | stage_id | query_id | state | splits | queued_splits | running_splits | completed_splits | split_scheduled_time_ms | split_cpu_time_ms | split_blocked_time_ms | raw_input_bytes | raw_input_rows | processed_input_bytes | processed_input_rows | output_bytes | output_rows | physical_written_bytes | created | start | last_heartbeat | end --------------------------------------+-----------------------------------+---------------------------------+-------------------------------+-----------------------------+----------+--------+---------------+----------------+------------------+-------------------------+-------------------+-----------------------+-----------------+----------------+-----------------------+----------------------+--------------+-------------+------------------------+-------------------------+-------------------------+-------------------------+------------------------- 17ffe5e1-affe-4339-b618-0f60723cabf4 | 20230626_194106_00035_4suid.1.0.0 | 20230626_194106_00035_4suid.1.0 | 20230626_194106_00035_4suid.1 | 20230626_194106_00035_4suid | FINISHED | 1 | 0 | 0 | 1 | 14 | 2 | 0 | 5965 | 36 | 5965 | 36 | 7269 | 36 | 0 | 2023-06-26 19:41:04.606 | 2023-06-26 19:41:04.618 | 2023-06-26 19:41:04.639 | 2023-06-26 19:41:04.665 17ffe5e1-affe-4339-b618-0f60723cabf4 | 20230626_194309_00038_4suid.1.0.0 | 20230626_194309_00038_4suid.1.0 | 20230626_194309_00038_4suid.1 | 20230626_194309_00038_4suid | FINISHED | 1 | 0 | 0 | 1 | 15 | 2 | 0 | 6125 | 37 | 6125 | 37 | 866 | 5 | 0 | 2023-06-26 19:43:07.346 | 2023-06-26 19:43:07.357 | 2023-06-26 19:43:07.385 | 2023-06-26 19:43:07.398 17ffe5e1-affe-4339-b618-0f60723cabf4 | 20230626_194106_00035_4suid.0.0.0 | 20230626_194106_00035_4suid.0.0 | 20230626_194106_00035_4suid.0 | 20230626_194106_00035_4suid | FINISHED | 16 | 0 | 0 | 16 | 60 | 1 | 440 | 7096 | 36 | 7269 | 36 | 7269 | 36 | 0 | 2023-06-26 19:41:04.611 | 2023-06-26 19:41:04.626 | 2023-06-26 19:41:04.634 | 2023-06-26 19:41:04.682 17ffe5e1-affe-4339-b618-0f60723cabf4 | 20230626_194309_00038_4suid.0.0.0 | 20230626_194309_00038_4suid.0.0 | 20230626_194309_00038_4suid.0 | 20230626_194309_00038_4suid | FINISHED | 17 | 0 | 0 | 17 | 108 | 2 | 189 | 1100 | 5 | 866 | 5 | 866 | 5 | 0 | 2023-06-26 19:43:07.356 | 2023-06-26 19:43:07.380 | 2023-06-26 19:43:07.380 | 2023-06-26 19:43:07.419 17ffe5e1-affe-4339-b618-0f60723cabf4 | 20230626_194431_00039_4suid.1.0.0 | 20230626_194431_00039_4suid.1.0 | 20230626_194431_00039_4suid.1 | 20230626_194431_00039_4suid | RUNNING | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2023-06-26 19:44:29.346 | 2023-06-26 19:44:29.352 | 2023-06-26 19:44:29.353 | NULL (5 rows) Quit Presto. quit;","title":"System Connector"},{"location":"wxd-systemconnector/#using-presto-system-connector","text":"The Presto System connector provides information and metrics about the currently running Presto cluster. You can use this function to monitor the workloads on the Presto cluster using normal SQL queries. Make sure you are the root user and in the proper development directory. cd /root/ibm-lh-dev/bin Start the Presto CLI. ./presto-cli What queries are currently running? 
select * from \"system\".runtime.queries limit 5; query_id | state | user | source | query | resource_group_id | queued_time_ms | analysis_time_ms | created | started | last_heartbeat | end -----------------------------+----------+------------+------------------+-------------------------------------------------------------+-------------------+----------------+------------------+-------------------------+-------------------------+-------------------------+------------------------- 20230626_182942_00007_4suid | FINISHED | ibmlhadmin | presto-cli | show tables | [global] | 0 | 33 | 2023-06-26 18:29:40.628 | 2023-06-26 18:29:40.817 | 2023-06-26 18:29:41.095 | 2023-06-26 18:29:41.118 20230626_182938_00005_4suid | FINISHED | ibmlhadmin | presto-cli | SHOW FUNCTIONS | [global] | 1 | 607 | 2023-06-26 18:29:36.718 | 2023-06-26 18:29:36.777 | 2023-06-26 18:29:37.707 | 2023-06-26 18:29:37.742 20230626_192655_00031_4suid | FINISHED | ibmlhadmin | presto-cli | show schemas | [global] | 1 | 257 | 2023-06-26 19:26:53.739 | 2023-06-26 19:26:54.043 | 2023-06-26 19:26:54.845 | 2023-06-26 19:26:54.866 20230626_183851_00018_4suid | FINISHED | ibmlhadmin | nodejs-client | select * from system.runtime.queries order by query_id desc | [global] | 1 | 27 | 2023-06-26 18:38:49.169 | 2023-06-26 18:38:49.293 | 2023-06-26 18:38:50.084 | 2023-06-26 18:38:50.121 20230626_185405_00021_4suid | FINISHED | ibmlhadmin | presto-go-client | SHOW TABLES | [global] | 0 | 56 | 2023-06-26 18:54:03.542 | 2023-06-26 18:54:03.729 | 2023-06-26 18:54:04.042 | 2023-06-26 18:54:04.041 (5 rows) What tasks make up a query and where is the task running? select * from \"system\".runtime.tasks limit 5; node_id | task_id | stage_execution_id | stage_id | query_id | state | splits | queued_splits | running_splits | completed_splits | split_scheduled_time_ms | split_cpu_time_ms | split_blocked_time_ms | raw_input_bytes | raw_input_rows | processed_input_bytes | processed_input_rows | output_bytes | output_rows | physical_written_bytes | created | start | last_heartbeat | end --------------------------------------+-----------------------------------+---------------------------------+-------------------------------+-----------------------------+----------+--------+---------------+----------------+------------------+-------------------------+-------------------+-----------------------+-----------------+----------------+-----------------------+----------------------+--------------+-------------+------------------------+-------------------------+-------------------------+-------------------------+------------------------- 17ffe5e1-affe-4339-b618-0f60723cabf4 | 20230626_194106_00035_4suid.1.0.0 | 20230626_194106_00035_4suid.1.0 | 20230626_194106_00035_4suid.1 | 20230626_194106_00035_4suid | FINISHED | 1 | 0 | 0 | 1 | 14 | 2 | 0 | 5965 | 36 | 5965 | 36 | 7269 | 36 | 0 | 2023-06-26 19:41:04.606 | 2023-06-26 19:41:04.618 | 2023-06-26 19:41:04.639 | 2023-06-26 19:41:04.665 17ffe5e1-affe-4339-b618-0f60723cabf4 | 20230626_194309_00038_4suid.1.0.0 | 20230626_194309_00038_4suid.1.0 | 20230626_194309_00038_4suid.1 | 20230626_194309_00038_4suid | FINISHED | 1 | 0 | 0 | 1 | 15 | 2 | 0 | 6125 | 37 | 6125 | 37 | 866 | 5 | 0 | 2023-06-26 19:43:07.346 | 2023-06-26 19:43:07.357 | 2023-06-26 19:43:07.385 | 2023-06-26 19:43:07.398 17ffe5e1-affe-4339-b618-0f60723cabf4 | 20230626_194106_00035_4suid.0.0.0 | 20230626_194106_00035_4suid.0.0 | 20230626_194106_00035_4suid.0 | 20230626_194106_00035_4suid | FINISHED | 16 | 0 | 0 | 16 | 60 | 1 | 440 | 7096 | 36 | 7269 | 36 | 
7269 | 36 | 0 | 2023-06-26 19:41:04.611 | 2023-06-26 19:41:04.626 | 2023-06-26 19:41:04.634 | 2023-06-26 19:41:04.682 17ffe5e1-affe-4339-b618-0f60723cabf4 | 20230626_194309_00038_4suid.0.0.0 | 20230626_194309_00038_4suid.0.0 | 20230626_194309_00038_4suid.0 | 20230626_194309_00038_4suid | FINISHED | 17 | 0 | 0 | 17 | 108 | 2 | 189 | 1100 | 5 | 866 | 5 | 866 | 5 | 0 | 2023-06-26 19:43:07.356 | 2023-06-26 19:43:07.380 | 2023-06-26 19:43:07.380 | 2023-06-26 19:43:07.419 17ffe5e1-affe-4339-b618-0f60723cabf4 | 20230626_194431_00039_4suid.1.0.0 | 20230626_194431_00039_4suid.1.0 | 20230626_194431_00039_4suid.1 | 20230626_194431_00039_4suid | RUNNING | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2023-06-26 19:44:29.346 | 2023-06-26 19:44:29.352 | 2023-06-26 19:44:29.353 | NULL (5 rows) Quit Presto. quit;","title":"Using Presto System Connector"},{"location":"wxd-timetravel/","text":"Time Travel Time travel allows you to change the view of the data to a previous time. This is not the same as an AS OF query commonly used in SQL. The data is rolled back to a prior time. Let us look at the snapshots available for the customer table in the workshop schema. We currently have just 1 snapshot. First make sure you are in the proper directory. cd /root/ibm-lh-dev/bin Connect to Presto using the workshop schema. ./presto-cli --catalog iceberg_data --schema workshop Check current snapshots \u2013 STARTING STATE. SELECT * FROM iceberg_data.workshop.\"customer$snapshots\" ORDER BY committed_at; committed_at | snapshot_id | parent_id | operation | manifest_list | summary -----------------------------+---------------------+-----------+-----------+------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 2023-06-05 18:30:12.994 UTC | 6243511110201494487 | NULL | append | s3a://iceberg-bucket/customer/metadata/snap-6243511110201494487-1-b5ab84dc-671a-426a-a734-940baa49a11f.avro | {changed-partition-count=1, added-data-files=1, total-equality-deletes=0, added-records=1500, total-position-deletes=0, added-files-size=75240, total-delete-files=0, total-files-size=75240, total-records=1500, total-data-files=1} (1 row) Capture the first snapshot ID returned by the SQL statement. You will need this value when you run the rollback command. SELECT snapshot_id FROM iceberg_data.workshop.\"customer$snapshots\" ORDER BY committed_at; snapshot_id --------------------- 6243511110201494487 (1 row) Remember the number that was returned by the query above. Insert the following record to change the customer table in the workshop schema. insert into customer values(1501,'Deepak','IBM SVL',16,'123-212-3455', 123,'AUTOMOBILE','Testing snapshots'); \u2003 Let us look at the snapshots available for the customer table in the workshop schema. You should have 2 snapshots.
SELECT * FROM iceberg_data.workshop.\"customer$snapshots\" ORDER BY committed_at; committed_at | snapshot_id | parent_id | operation | manifest_list | summary -----------------------------+---------------------+---------------------+-----------+------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 2023-06-05 18:30:12.994 UTC | 6243511110201494487 | NULL | append | s3a://iceberg-bucket/customer/metadata/snap-6243511110201494487-1-b5ab84dc-671a-426a-a734-940baa49a11f.avro | {changed-partition-count=1, added-data-files=1, total-equality-deletes=0, added-records=1500, total-position-deletes=0, added-files-size=75240, total-delete-files=0, total-files-size=75240, total-records=1500, total-data-files=1} 2023-06-05 18:52:49.193 UTC | 7110570704088319509 | 6243511110201494487 | append | s3a://iceberg-bucket/customer/metadata/snap-7110570704088319509-1-ef26bcf1-c122-4ea4-86b7-ba26369be374.avro | {changed-partition-count=1, added-data-files=1, total-equality-deletes=0, added-records=1, total-position-deletes=0, added-files-size=1268, total-delete-files=0, total-files-size=76508, total-records=1501, total-data-files=2} (2 rows) Querying the customer table in the workshop schema, we can see the record inserted with name=\u2019Deepak\u2019. select * from customer where name='Deepak'; custkey | name | address | nationkey | phone | acctbal | mktsegment | comment ---------+--------+---------+-----------+--------------+---------+------------+------------------- 1501 | Deepak | IBM SVL | 16 | 123-212-3455 | 123.0 | AUTOMOBILE | Testing snapshots (1 row) Suppose we realize that we don\u2019t want the recent updates, or we need to see what the data looked like at an earlier point in time to respond to regulatory requirements. We will leverage the out-of-the-box system function rollback_to_snapshot to roll back to an older snapshot. The syntax for this function is: CALL iceberg_data.system.rollback_to_snapshot('workshop','customer',x); The \"x\" would get replaced with the snapshot_id number that was found in the earlier query. It will be different on your system from the examples above. Copy the next code segment into Presto. CALL iceberg_data.system.rollback_to_snapshot('workshop','customer', You will see output similar to the following: CALL iceberg_data.system.rollback_to_snapshot('workshop','customer', -> At this point you will need to copy and paste your snapshot_id into the Presto command line and press return or enter. You will see the following: CALL iceberg_data.system.rollback_to_snapshot('workshop','customer', -> 7230522396120575591 7230522396120575591 Now you will need to terminate the command with a ); to see the final result. ); CALL iceberg_data.system.rollback_to_snapshot('workshop','customer', -> 7230522396120575591 7230522396120575591 -> ); ); CALL Querying the customer table in the workshop schema, we can no longer see the record inserted with name=\u2019Deepak\u2019. select * from customer where name='Deepak'; custkey | name | address | nationkey | phone | acctbal | mktsegment | comment ---------+--------+---------+-----------+--------------+---------+------------+------------------- (0 rows)
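For reference, once you know the snapshot ID, the rollback can also be issued as a single statement rather than being pasted in pieces. A minimal sketch using the first snapshot ID from the output shown earlier (substitute the value from your own system): CALL iceberg_data.system.rollback_to_snapshot('workshop','customer',6243511110201494487); Quit Presto.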
quit;","title":"Time Travel"},{"location":"wxd-timetravel/#time-travel","text":"Time travel allows you to change the view of the data to a previous time. This is not the same as an AS OF query commonly used in SQL. The data is rolled back to a prior time. Let us look at the snapshots available for the customer table in the workshop schema. We currently have just 1 snapshot. First make sure you are in the proper directory. cd /root/ibm-lh-dev/bin Connect to Presto using the workshop schema. ./presto-cli --catalog iceberg_data --schema workshop Check current snapshots \u2013 STARTING STATE. SELECT * FROM iceberg_data.workshop.\"customer$snapshots\" ORDER BY committed_at; committed_at | snapshot_id | parent_id | operation | manifest_list | summary -----------------------------+---------------------+-----------+-----------+------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 2023-06-05 18:30:12.994 UTC | 6243511110201494487 | NULL | append | s3a://iceberg-bucket/customer/metadata/snap-6243511110201494487-1-b5ab84dc-671a-426a-a734-940baa49a11f.avro | {changed-partition-count=1, added-data-files=1, total-equality-deletes=0, added-records=1500, total-position-deletes=0, added-files-size=75240, total-delete-files=0, total-files-size=75240, total-records=1500, total-data-files=1} (1 row) Capture the first snapshot ID returned by the SQL statement. You will need this value when you run the rollback command. SELECT snapshot_id FROM iceberg_data.workshop.\"customer$snapshots\" ORDER BY committed_at; snapshot_id --------------------- 6243511110201494487 (1 row) Remember the number that was returned by the query above. Insert the following record to change the customer table in the workshop schema. insert into customer values(1501,'Deepak','IBM SVL',16,'123-212-3455', 123,'AUTOMOBILE','Testing snapshots'); \u2003 Let us look at the snapshots available for the customer table in the workshop schema. You should have 2 snapshots.
SELECT * FROM iceberg_data.workshop.\"customer$snapshots\" ORDER BY committed_at; committed_at | snapshot_id | parent_id | operation | manifest_list | summary -----------------------------+---------------------+---------------------+-----------+------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 2023-06-05 18:30:12.994 UTC | 6243511110201494487 | NULL | append | s3a://iceberg-bucket/customer/metadata/snap-6243511110201494487-1-b5ab84dc-671a-426a-a734-940baa49a11f.avro | {changed-partition-count=1, added-data-files=1, total-equality-deletes=0, added-records=1500, total-position-deletes=0, added-files-size=75240, total-delete-files=0, total-files-size=75240, total-records=1500, total-data-files=1} 2023-06-05 18:52:49.193 UTC | 7110570704088319509 | 6243511110201494487 | append | s3a://iceberg-bucket/customer/metadata/snap-7110570704088319509-1-ef26bcf1-c122-4ea4-86b7-ba26369be374.avro | {changed-partition-count=1, added-data-files=1, total-equality-deletes=0, added-records=1, total-position-deletes=0, added-files-size=1268, total-delete-files=0, total-files-size=76508, total-records=1501, total-data-files=2} (2 rows) Querying the customer table in the workshop schema, we can see the record inserted with name=\u2019Deepak\u2019. select * from customer where name='Deepak'; custkey | name | address | nationkey | phone | acctbal | mktsegment | comment ---------+--------+---------+-----------+--------------+---------+------------+------------------- 1501 | Deepak | IBM SVL | 16 | 123-212-3455 | 123.0 | AUTOMOBILE | Testing snapshots (1 row) Suppose we realize that we don\u2019t want the recent updates, or we need to see what the data looked like at an earlier point in time to respond to regulatory requirements. We will leverage the out-of-the-box system function rollback_to_snapshot to roll back to an older snapshot. The syntax for this function is: CALL iceberg_data.system.rollback_to_snapshot('workshop','customer',x); The \"x\" would get replaced with the snapshot_id number that was found in the earlier query. It will be different on your system from the examples above. Copy the next code segment into Presto. CALL iceberg_data.system.rollback_to_snapshot('workshop','customer', You will see output similar to the following: CALL iceberg_data.system.rollback_to_snapshot('workshop','customer', -> At this point you will need to copy and paste your snapshot_id into the Presto command line and press return or enter. You will see the following: CALL iceberg_data.system.rollback_to_snapshot('workshop','customer', -> 7230522396120575591 7230522396120575591 Now you will need to terminate the command with a ); to see the final result. ); CALL iceberg_data.system.rollback_to_snapshot('workshop','customer', -> 7230522396120575591 7230522396120575591 -> ); ); CALL Querying the customer table in the workshop schema, we can no longer see the record inserted with name=\u2019Deepak\u2019. select * from customer where name='Deepak'; custkey | name | address | nationkey | phone | acctbal | mktsegment | comment ---------+--------+---------+-----------+--------------+---------+------------+------------------- (0 rows)
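For reference, once you know the snapshot ID, the rollback can also be issued as a single statement rather than being pasted in pieces. A minimal sketch using the first snapshot ID from the output shown earlier (substitute the value from your own system): CALL iceberg_data.system.rollback_to_snapshot('workshop','customer',6243511110201494487); Quit Presto.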
quit;","title":"Time Travel"},{"location":"wxd-troubleshooting/","text":"Troubleshooting watsonx.data Although we have tried to make the lab as error-free as possible, occasionally things will go wrong. Here is a list of common questions, problems, and potential solutions. What are the passwords for the services I Can't Open up a Terminal Window with VNC or Guacamole A SQL Statement failed but there are no error messages Apache Superset isn't Starting Apache Superset screens differ from the lab Too many incorrect logins using VNC and now I'm blocked Presto doesn't appear to be working Displaying Db2 Schema is failing Queries are failing with a 400 code Queries are failing with a 200 or 500 code Queries are failing with memory errors SSH, VNC and watsonx.data UI are not working No access to Presto/Minio UI after restart Firefox and Chrome freeze when connecting to MinIO What are the passwords for the services? See the section on Passwords . You can get all passwords for the system when you are logged in as the watsonx user by using the following command. cat /certs/passwords You can also use the Jupyter notebook link to display the userids and passwords for the services. I Can't Open up a Terminal Window with VNC or Guacamole The first thing to remember is that you can't use VNC and the TechZone VM Remote Console (Guacamole) interface at the same time. Only one can be active at a time. If you can't use terminal windows in VNC If you find that the terminal icon \"spins\" inside the VNC window, this is caused by attempting to connect to the virtual machine by using the VM Remote Console button in your reservation details screen. To fix this problem, you must log out of the VNC session (top right corner of the Linux desktop - press the power button and choose logout). Once VNC logs back in, you will be able to use the terminal window. A SQL Statement failed, but there are no error messages You need to use the Presto console and search for the SQL statement. Click on the Query ID to find more details of the statement execution and scroll to the bottom of the web page to see any error details. Apache Superset isn't Starting If Superset doesn't start for some reason, you will need to reset it completely to try it again. First make sure you are connected as the watsonx user, not root . Make sure you have stopped the terminal session that is running Apache Superset. Next remove the Apache Superset directory. sudo rm -rf /home/watsonx/superset We remove the docker images associated with Apache Superset. If no containers or volumes exist, you will get an error message. docker ps -a -q --filter \"name=superset\" | xargs docker container rm --force docker volume list -q --filter \"name=superset\" | xargs docker volume rm --force Download the superset code again. git clone https://github.com/apache/superset.git The docker-compose-non-dev.yml file needs to be updated so that Apache Superset can access the same network that watsonx.data is using. cd ./superset cp docker-compose-non-dev.yml docker-compose-non-dev-backup.yml sed '/version: \"3.7\"/q' docker-compose-non-dev.yml > yamlfix.txt cat << EOF >> yamlfix.txt networks: default: external: True name: ibm-lh-network EOF sed -e '1,/version: \"3.7\"/ d' docker-compose-non-dev.yml >> yamlfix.txt We update the Apache Superset code to version 2.1.0 . sed 's/\\${TAG:-latest-dev}/2.1.0/' yamlfix.txt > docker-compose-non-dev.yml Use docker-compose to start Apache Superset.
nohup docker compose -f docker-compose-non-dev.yml up & The nohup command will issue a message indicating that output will be directed to the nohup.out file. It takes some time for the service to start, so be patient! You can view any output from the Apache Superset system by viewing the nohup.out file in the directory where you installed superset. Apache Superset screens differ from the lab The Apache Superset project makes frequent changes to the types of charts that are available. In some cases they remove or merge charts. Since these chart changes are dynamic, we are not able to guarantee that our examples will look the same as what you might have on your system. Presto doesn't appear to be working If you find that the watsonx.data UI is generating error messages that suggest that queries are not running, or that the Presto service is dead, you can force a soft restart of Presto with the following command: docker restart ibm-lh-presto This will restart the Presto server. If that does not fix your problem, you will need to do a hard reset using the following commands: sudo su - cd /root/ibm-lh-dev/bin ./stop_service ibm-lh-presto ./start_service ibm-lh-presto check_presto The command will wait until the service is running before exiting. Displaying Db2 Schema is failing Occasionally when attempting to expand the Db2 catalog (schema), the watsonx.data UI will not display any data or will issue an error message. You can try refreshing the browser (not the refresh icon inside the UI) and try again. If you find that this is failing again, open the Query workspace and run the following SQL (replace db2_gosales with the name you cataloged the database with). select count(*) from db2_gosales.gosalesdw.go_org_dim The result should be 123 and hopefully the tables that are part of the schema will display for you. Queries are failing with a 400 code The watsonx.data UI will log you out after a period of inactivity, but doesn't tell you that this has happened. When you attempt to run a query, the error that is returned (400) indicates that you need to log back in again. Queries are failing with a 200 or 500 code A 500 code may indicate the watsonx.data UI has a problem connecting with the Presto engine. First log out of the console and try logging back on. If that fails to solve the problem, you will need to reboot the console. Open up a terminal window into the server: As the root user, restart the docker container that is running the watsonx.data UI. docker restart lhconsole-nodeclient-svc Queries fail because of insufficient memory If you are running a complex query, you may get an error message similar to \"Query exceeded per-node user memory limit\" or something similar. Watsonx.data (Presto) attempts to limit the amount of resources being used in a query and will stop a query if it exceeds a certain threshold. You can change the behavior of the system by making the following changes. Note : During this step you will disconnect anyone running a query on the server. What you need to do is make a change to the configuration settings of the Presto engine. As the root user, enter the docker container for the Presto engine: docker exec -it ibm-lh-presto /bin/bash Next, copy the original config file to a safe place in case we make an error: cp /opt/presto/etc/config.properties /opt/presto/etc/config.properties.backup Then update the properties file.
cat >> /opt/presto/etc/config.properties << EOL experimental.spiller-spill-path=/tmp experimental.spiller-max-used-space-threshold=0.7 experimental.max-spill-per-node=10GB experimental.query-max-spill-per-node=10GB experimental.spill-enabled=true query.max-memory=10GB query.max-memory-per-node=10GB query.max-total-memory-per-node=10GB query.max-total-memory=10GB EOL Double-check that it worked. cat /opt/presto/etc/config.properties | grep experimental experimental.max-spill-per-node=10GB experimental.query-max-spill-per-node=10GB experimental.spill-enabled=true experimental.spiller-max-used-space-threshold=0.7 experimental.spiller-spill-path=/tmp If it is all good, then exit the container. exit And now we restart the container. Make sure that you don't impact other users! docker restart ibm-lh-presto Now try running your query again. Note : Once you make this change, only restart Presto using the above command, otherwise you will lose the changes. Too many incorrect logins using VNC and now I'm blocked from connecting If you lock yourself out of VNC because of too many incorrect logins, you can reset the service with the following commands. Connect as the root user, then run the following command, and you should be able to log in again. systemctl restart vncserver@:1 exit SSH, VNC and watsonx.data UI are not working Symptoms: You've tried to use SSH to log into the system, and you get a timeout error. All the Web-based UIs (watsonx.data UI, Presto) fail. Find your email message that contains details of your reservation. Details of the reservation, and the page that contains them, can be found in the Accessing the reservation section. Once the details appear, scroll down to the bottom of the web page, and you will see the VM Remote Console button. You can access the logon screen of the virtual machine by pressing the VM Remote Console button. Clicking on this button will display the logon screen for the server. If you see this screen, the system is running and there is something wrong with the watsonx.data service (see instructions below). If you see the following screen: This means your server has been turned off. Click on the Power on button. Make sure to press the Yes button to turn the power on! In a few minutes you should see the logon screen again. Wait for a few minutes for all the services to start, and then you will be able to use SSH, VNC, and watsonx.data UI. Reset watsonx.data If you can log into the watsonx userid using the VM Remote Console, you can reset the watsonx.data server with the following steps. SSH into the server as the root user. Then switch to the development code bin directory. cd /root/ibm-lh-dev/bin Check the status of the system with the following command. ./status.sh --all Output will look similar to: using /root/ibm-lh-dev/localstorage/volumes as data root directory for user: root/1001 infra config location is /root/ibm-lh-dev/localstorage/volumes/infra lhconsole-ui running 0.0.0.0:9443->8443/tcp, :::9443->8443/tcp lhconsole-nodeclient-svc running 3001/tcp lhconsole-javaapi-svc running 8090/tcp lhconsole-api running 3333/tcp, 8081/tcp ibm-lh-presto running 0.0.0.0:8443->8443/tcp, :::8443->8443/tcp ibm-lh-hive-metastore running ibm-lh-postgres running 5432/tcp ibm-lh-minio running If any of the services are not running, you will need to restart the system with the following set of commands.
cd /root/ibm-lh-dev/bin ./stop.sh export LH_RUN_MODE=diag ./start.sh Wait for all services to start and then check to see if you can connect to the watsonx.data UI. No access to Presto/Minio UI after restart If you are using a TechZone image that has been suspended, or restarted, you may come across a situation where you are unable to connect to any service that uses the http protocol. The watsonx.data service needs to have a diagnostic flag set that opens up these ports, and sometimes this diagnostic setting is not being updated. To manually stop and start the system, you will need to connect with root user privileges and run the following commands: sudo su - cd /root/ibm-lh-dev/bin ./stop.sh export LH_RUN_MODE=diag ./start.sh This set of commands will stop all the services in watsonx.data and restart them in diagnostic mode. This will now open the http ports for use. Firefox and Chrome freeze when connecting to MinIO Firefox and Chrome on OSX will occasionally freeze when connecting to the MinIO console. The Safari browser is much more reliable. This problem appears to be caused by some features which are not properly handled by these browsers.","title":"Troubleshooting"},{"location":"wxd-troubleshooting/#troubleshooting-watsonxdata","text":"Although we have tried to make the lab as error-free as possible, occasionally things will go wrong. Here is a list of common questions, problems, and potential solutions. What are the passwords for the services I Can't Open up a Terminal Window with VNC or Guacamole A SQL Statement failed but there are no error messages Apache Superset isn't Starting Apache Superset screens differ from the lab Too many incorrect logins using VNC and now I'm blocked Presto doesn't appear to be working Displaying Db2 Schema is failing Queries are failing with a 400 code Queries are failing with a 200 or 500 code Queries are failing with memory errors SSH, VNC and watsonx.data UI are not working No access to Presto/Minio UI after restart Firefox and Chrome freeze when connecting to MinIO","title":"Troubleshooting watsonx.data"},{"location":"wxd-troubleshooting/#what-are-the-passwords-for-the-services","text":"See the section on Passwords . You can get all passwords for the system when you are logged in as the watsonx user by using the following command. cat /certs/passwords You can also use the Jupyter notebook link to display the userids and passwords for the services.","title":"What are the passwords for the services?"},{"location":"wxd-troubleshooting/#i-cant-open-up-a-terminal-window-with-vnc-or-guacamole","text":"The first thing to remember is that you can't use VNC and the TechZone VM Remote Console (Guacamole) interface at the same time. Only one can be active at a time.","title":"I Can't Open up a Terminal Window with VNC or Guacamole"},{"location":"wxd-troubleshooting/#if-you-cant-use-terminal-windows-in-vnc","text":"If you find that the terminal icon \"spins\" inside the VNC window, this is caused by attempting to connect to the virtual machine by using the VM Remote Console button in your reservation details screen. To fix this problem, you must log out of the VNC session (top right corner of the Linux desktop - press the power button and choose logout). Once VNC logs back in, you will be able to use the terminal window.","title":"If you can't use terminal windows in VNC"},{"location":"wxd-troubleshooting/#a-sql-statement-failed-but-there-are-no-error-messages","text":"You need to use the Presto console and search for the SQL statement. Click on the Query ID to find more details of the statement execution and scroll to the bottom of the web page to see any error details.
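As an alternative, you can also locate a failing statement from the Presto CLI by querying the System connector described earlier; a minimal sketch: select query_id, state, query from \"system\".runtime.queries where state = 'FAILED';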
","title":"A SQL Statement failed, but there are no error messages"},{"location":"wxd-troubleshooting/#apache-superset-isnt-starting","text":"If Superset doesn't start for some reason, you will need to reset it completely to try it again. First make sure you are connected as the watsonx user, not root . Make sure you have stopped the terminal session that is running Apache Superset. Next remove the Apache Superset directory. sudo rm -rf /home/watsonx/superset We remove the docker images associated with Apache Superset. If no containers or volumes exist, you will get an error message. docker ps -a -q --filter \"name=superset\" | xargs docker container rm --force docker volume list -q --filter \"name=superset\" | xargs docker volume rm --force Download the superset code again. git clone https://github.com/apache/superset.git The docker-compose-non-dev.yml file needs to be updated so that Apache Superset can access the same network that watsonx.data is using. cd ./superset cp docker-compose-non-dev.yml docker-compose-non-dev-backup.yml sed '/version: \"3.7\"/q' docker-compose-non-dev.yml > yamlfix.txt cat << EOF >> yamlfix.txt networks: default: external: True name: ibm-lh-network EOF sed -e '1,/version: \"3.7\"/ d' docker-compose-non-dev.yml >> yamlfix.txt We update the Apache Superset code to version 2.1.0 . sed 's/\\${TAG:-latest-dev}/2.1.0/' yamlfix.txt > docker-compose-non-dev.yml Use docker-compose to start Apache Superset. nohup docker compose -f docker-compose-non-dev.yml up & The nohup command will issue a message indicating that output will be directed to the nohup.out file. It takes some time for the service to start, so be patient! You can view any output from the Apache Superset system by viewing the nohup.out file in the directory where you installed superset.","title":"Apache Superset isn't Starting"},{"location":"wxd-troubleshooting/#apache-superset-screens-differ-from-the-lab","text":"The Apache Superset project makes frequent changes to the types of charts that are available. In some cases they remove or merge charts. Since these chart changes are dynamic, we are not able to guarantee that our examples will look the same as what you might have on your system.","title":"Apache Superset screens differ from the lab"},{"location":"wxd-troubleshooting/#presto-doesnt-appear-to-be-working","text":"If you find that the watsonx.data UI is generating error messages that suggest that queries are not running, or that the Presto service is dead, you can force a soft restart of Presto with the following command: docker restart ibm-lh-presto This will restart the Presto server. If that does not fix your problem, you will need to do a hard reset using the following commands: sudo su - cd /root/ibm-lh-dev/bin ./stop_service ibm-lh-presto ./start_service ibm-lh-presto check_presto The command will wait until the service is running before exiting.","title":"Presto doesn't appear to be working"},{"location":"wxd-troubleshooting/#displaying-db2-schema-is-failing","text":"Occasionally when attempting to expand the Db2 catalog (schema), the watsonx.data UI will not display any data or will issue an error message. You can try refreshing the browser (not the refresh icon inside the UI) and try again. If you find that this is failing again, open the Query workspace and run the following SQL (replace db2_gosales with the name you cataloged the database with).
select count(*) from db2_gosales.gosalesdw.go_org_dim The result should be 123 and hopefully the tables that are part of the schema will display for you.","title":"Displaying Db2 Schema is failing"},{"location":"wxd-troubleshooting/#queries-are-failing-with-a-400-code","text":"The watsonx.data UI will log you out after a period of inactivity, but doesn't tell you that this has happened. When you attempt to run a query, the error that is returned (400) indicates that you need to log back in again.","title":"Queries are failing with a 400 code"},{"location":"wxd-troubleshooting/#queries-are-failing-with-a-200-or-500-code","text":"A 500 code may indicate the watsonx.data UI has a problem connecting with the Presto engine. First log out of the console and try logging back on. If that fails to solve the problem, you will need to reboot the console. Open up a terminal window into the server: As the root user, restart the docker container that is running the watsonx.data UI. docker restart lhconsole-nodeclient-svc","title":"Queries are failing with a 200 or 500 code"},{"location":"wxd-troubleshooting/#queries-fail-become-of-insufficient-memory","text":"If you are running a complex query, you may get an error message similar to \"Query exceeded per-node user memory limit\" or something similar. Watsonx.data (Presto) attempts to limit the amount of resources being used in a query and will stop a query if it exceeds a certain threshold. You can change the behavior of the system by making the following changes. Note : During this step you will disconnect anyone running a query on the server. What you need to do is make a change to the configuration settings of the Presto engine. As the root user, enter the docker container for the Presto engine: docker exec -it ibm-lh-presto /bin/bash Next, copy the original config file to a safe place in case we make an error: cp /opt/presto/etc/config.properties /opt/presto/etc/config.properties.backup Then update the properties file. cat >> /opt/presto/etc/config.properties << EOL experimental.spiller-spill-path=/tmp experimental.spiller-max-used-space-threshold=0.7 experimental.max-spill-per-node=10GB experimental.query-max-spill-per-node=10GB experimental.spill-enabled=true query.max-memory=10GB query.max-memory-per-node=10GB query.max-total-memory-per-node=10GB query.max-total-memory=10GB EOL Double-check that it worked. cat /opt/presto/etc/config.properties | grep experimental experimental.max-spill-per-node=10GB experimental.query-max-spill-per-node=10GB experimental.spill-enabled=true experimental.spiller-max-used-space-threshold=0.7 experimental.spiller-spill-path=/tmp If it is all good, then exit the container. exit And now we restart the container. Make sure that you don't impact other users! docker restart ibm-lh-presto Now try running your query again. Note : Once you make this change, only restart Presto using the above command, otherwise you will lose the changes.","title":"Queries fail because of insufficient memory"},{"location":"wxd-troubleshooting/#too-many-incorrect-logins-using-vnc-and-now-im-blocked-from-connecting","text":"If you lock yourself out of VNC because of too many incorrect logins, you can reset the service with the following commands. Connect as the root user, then run the following command, and you should be able to log in again.
systemctl restart vncserver@:1 exit","title":"Too many incorrect logins using VNC and now I'm blocked from connecting"},{"location":"wxd-troubleshooting/#ssh-vnc-and-watsonxdata-ui-are-not-working","text":"Symptoms: You've tried to use SSH to log into the system, and you get a timeout error. All the Web-based UIs (watsonx.data UI, Presto) fail. Find your email message that contains details of your reservation. Details of the reservation, and the page that contains them, can be found in the Accessing the reservation section. Once the details appear, scroll down to the bottom of the web page, and you will see the VM Remote Console button. You can access the logon screen of the virtual machine by pressing the VM Remote Console button. Clicking on this button will display the logon screen for the server. If you see this screen, the system is running and there is something wrong with the watsonx.data service (see instructions below). If you see the following screen: This means your server has been turned off. Click on the Power on button. Make sure to press the Yes button to turn the power on! In a few minutes you should see the logon screen again. Wait for a few minutes for all the services to start, and then you will be able to use SSH, VNC, and watsonx.data UI.","title":"SSH, VNC and watsonx.data UI are not working"},{"location":"wxd-troubleshooting/#reset-watsonxdata","text":"If you can log into the watsonx userid using the VM Remote Console, you can reset the watsonx.data server with the following steps. SSH into the server as the root user. Then switch to the development code bin directory. cd /root/ibm-lh-dev/bin Check the status of the system with the following command. ./status.sh --all Output will look similar to: using /root/ibm-lh-dev/localstorage/volumes as data root directory for user: root/1001 infra config location is /root/ibm-lh-dev/localstorage/volumes/infra lhconsole-ui running 0.0.0.0:9443->8443/tcp, :::9443->8443/tcp lhconsole-nodeclient-svc running 3001/tcp lhconsole-javaapi-svc running 8090/tcp lhconsole-api running 3333/tcp, 8081/tcp ibm-lh-presto running 0.0.0.0:8443->8443/tcp, :::8443->8443/tcp ibm-lh-hive-metastore running ibm-lh-postgres running 5432/tcp ibm-lh-minio running If any of the services are not running, you will need to restart the system with the following set of commands. cd /root/ibm-lh-dev/bin ./stop.sh export LH_RUN_MODE=diag ./start.sh Wait for all services to start and then check to see if you can connect to the watsonx.data UI.","title":"Reset watsonx.data"},{"location":"wxd-troubleshooting/#no-access-to-prestominio-ui-after-restart","text":"If you are using a TechZone image that has been suspended, or restarted, you may come across a situation where you are unable to connect to any service that uses the http protocol. The watsonx.data service needs to have a diagnostic flag set that opens up these ports, and sometimes this diagnostic setting is not being updated. To manually stop and start the system, you will need to connect with root user privileges and run the following commands: sudo su - cd /root/ibm-lh-dev/bin ./stop.sh export LH_RUN_MODE=diag ./start.sh This set of commands will stop all the services in watsonx.data and restart them in diagnostic mode.
This will now open the http ports for use.","title":"No access to Presto/Minio UI after restart"},{"location":"wxd-troubleshooting/#firefox-and-chrome-freeze-when-connecting-to-minio","text":"Firefox and Chrome on OSX will occasionally freeze when connecting to the MinIO console. The Safari browser is much more reliable. This problem appears to be caused by some features which are not properly handled by these browsers.","title":"Firefox and Chrome freeze when connecting to MinIO"},{"location":"wxd-useradmin/","text":"Watsonx.data User Administration and Roles Security and access control within watsonx.data are based on roles. A role is a set of privileges that control the actions that users can perform. Authorization is granted by assigning a specific role to a user, or by adding the user to a group that has been assigned one or more roles. Access control at the infrastructural level allows permissions to be granted on the engines, catalogs, buckets, and databases. Roles for these components include Admin, Manager, User, Writer, and Reader (depending on the component). Access to the data itself is managed through data control policies. Policies can be created to permit or deny access to schemas, tables, and columns. User account management and access management vary between the different deployment options for watsonx.data. For instance, in the managed cloud service (SaaS), the service owner would need to invite other users to the environment and give them appropriate service access. With the standalone software, users can be added within the console\u2019s Access control page. In the Developer Edition, users can be added using a command line tool. Credits: Portions of text found in this section were copied from Kelly Schlamb's watsonx.data L3 course. User Administration This lab is using the Developer edition of the watsonx.data software, which means that the Access control panel is not available. In order to manage users, the user-mgmt command will need to be used. The user-mgmt command is found in the /root/ibm-lh-dev/bin directory. Examples of using the command are found below. Add a User The syntax for adding a user is: ./user-mgmt add-user [username] [ User | Admin ] [password] The values are: username - The name of the user [User|Admin] - The type of user. Note that the type of user is case-sensitive! password - The password for the user. The following command will add the user watsonx with a password of watsonx.data . This will be a standard user with no privileges. The first step is to make sure you are connected as the root user in the watsonx.data server and have switched to the proper directory. sudo su - cd /root/ibm-lh-dev/bin The next command will add a new user to the system. ./user-mgmt add-user watsonx User watsonx.data Change a User's Password The syntax for changing a password is: ./user-mgmt change-password [username] This command will require that the user enter the new password. You can issue the command and provide the new password at the prompt. Another way to supply the password is to use the Linux yes function, which repeats a value multiple times. The following command will change the password of watsonx to hellowatson . yes hellowatson | ./user-mgmt change-password watsonx Validate a User's Password You can validate a password by using the following command: ./user-mgmt test-user-cred [username] The username is the name of the user that you want to check the password for. This command will require that the user enter the existing password to check it.
You can use the yes function (as described above) to supply the password automatically. The following command will check that we have changed the password of watsonx to hellowatson . yes hellowatson | ./user-mgmt test-user-cred watsonx Delete a User To delete a user, use the following command: ./user-mgmt delete-user [username] The error messages on group ownership can be safely ignored. The following command will remove our watsonx user. ./user-mgmt delete-user watsonx Roles and Policies In this section you will add a new user and provide them with privileges over the infrastructure and data. Start by adding a new user to the watsonx.data system. If you haven't already, make sure you are connected to the server as the root user and are in the /root/ibm-lh-dev/bin directory. Add user1 to the system with a password of password . ./user-mgmt add-user user1 User password Access Control To view what users are currently authorized to use the system, select the Access control icon found on the left side of the watsonx.data UI. A list of objects that make up the watsonx.data infrastructure is displayed. You can see that the objects are made up of: Engines Catalogs Buckets In a real-world scenario where a user will be querying data from a table, that user will need to be given a minimum of User access to an engine (to be able to run the query), User access for the catalog associated with the data (to be able to see the schema information associated with the table), and Reader access to the bucket associated with the data (to be able to read the data from object storage). Additionally, a policy has to be created to permit the user to access the table in question. Granting Access Select the presto-01 engine (highlighted in red above) to view the current users that have access to the engine. At this point, only the administrative user (ibmlhadmin) can use the Presto engine. Click on the Add Access button to add a new authorized user to the list. The role button has been selected in the dialog to show the role options of Admin, Manager, or User. An Admin user can grant any role to a user, while a Manager can only grant User privileges. For this lab, grant user1 \"user\" privileges and then press Add. Close the dialog by clicking on the [x] on the top right of the screen. Now user1 needs to be granted access to a catalog. In this case, the iceberg_data and hive_data catalogs are required for the Presto engine and are implicitly granted to the user. Click on the iceberg_data line to confirm that this is the case. You should see that user1 has already been granted access to the catalog. If your version of watsonx.data Developer Edition does not have access granted to user1 , add it manually using the same steps as for engine access. Close the dialog to return to the Access control screen. The final step is to grant access to the underlying buckets. Because user1 was implicitly granted access to the iceberg_data and hive_data catalogs, the underlying buckets iceberg-bucket and hive-bucket were also added to their access list. Click on iceberg-bucket to view the access control. Again, if user1 does not have access to the bucket, add access using the same steps as adding engine access. When done viewing the screen, close the dialog to return to the Access control dialog. Policies After access has been granted to engines, catalogs, and buckets, the final step is to create a policy to grant access to tables. Click on the Policy tab to display the current policies in place (there should be none).
The Add Policy button is highlighted on the far right side of the screen. Pressing the button will display the new Access Control Policy dialog. Fill in the following information: Policy name: selectflights Description: blank Policy status: active You can always activate a policy after you have created it. Click Next. Here we need to select which schemas the user will be able to access. For this example, select the ontime schema. After selecting the schema, a set of tables associated with the schema will be listed. You can choose which tables can be searched. If you choose an individual table, you can restrict which columns can be searched. Select the ontime table and then select the following columns (you will need to scroll down the page): flightdate reporting_airline flight_number_reporting_airline originairportid destairportid Once you have selected the columns, press the Next button to display the Rules dialog. The Rules dialog allows you to fine-tune what actions can be done by a user against the data. Press the Add Rule button to display the Add Rule dialog. Rules can be used to Allow actions or to Deny actions. In our example, we want to allow user1 to SELECT data from the table, but with no other options. Note : In production versions of watsonx.data, you can provide access to a group, which makes it simpler to create a set of rules that apply to the group and then add users to that group. That way a user inherits the rules that were applied to the group, rather than rules having to be created for each individual. The developer edition displays GROUP as an option, but it is not implemented. Once the fields are filled in, press the Add button. You can continue to add additional rules to the list. Since we only want the one rule, select the Review button. Once you have confirmed that the data objects and rules look correct, press the Save button. The selectflights policy is now in place and is actively enforced. Before testing the policy enforcement, use the SQL icon on the left side to navigate to the hive_data catalog and view the schemas that are associated with it. Expand the ontime schema to view the tables and the columns that are available in the ontime table. When you connect as user1 , you will be able to compare which objects can be accessed from that userid. Testing Policy Enforcement To test whether the rules are enforced, you will need to log out of the current watsonx.data UI session. At the top of the Access Control screen, you will see the user icon on the top right. Clicking on the icon will display the logout dialog. Log out to the main watsonx.data UI screen. You will be prompted to confirm the logout. Once back at the main login panel, enter user1 and password into the dialog. Your watsonx.data UI should now display user1 . Navigate to the SQL icon and then select hive_data -> ontime -> ontime . You should notice that user1 was restricted to seeing only the ontime schema in the hive_data catalog. In addition, the user was restricted to accessing one of the tables ( ontime ) and 5 columns. Attempting to run a SELECT statement against all the data will result in a policy error, while correcting the SQL to include only permitted columns results in an answer set, as shown in the example below. The policy rules have been enforced for user1 , preventing them from viewing any other schemas or tables in the system. In addition, the SQL that they could execute was restricted to specific columns in the table.
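The two statements below make the enforcement concrete. This is a hedged sketch rather than lab-provided SQL: the fully qualified name hive_data.ontime.ontime follows the hive_data -> ontime -> ontime navigation above, and the LIMIT clause is only there to keep the output small.
-- denied: the wildcard pulls in columns outside the selectflights policy
SELECT * FROM hive_data.ontime.ontime LIMIT 5;
-- allowed: only the five permitted columns are referenced
SELECT flightdate, reporting_airline, flight_number_reporting_airline,
       originairportid, destairportid
FROM hive_data.ontime.ontime
LIMIT 5;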
Before moving on to any other sections, make sure to log out as user1 and reconnect as the ibmlhadmin user.","title":"User Administration and Roles"},{"location":"wxd-useradmin/#watsonxdata-user-administration-and-roles","text":"Security and access control within watsonx.data are based on roles. A role is a set of privileges that control the actions that users can perform. Authorization is granted by assigning a specific role to a user, or by adding the user to a group that has been assigned one or more roles. Access control at the infrastructural level allows permissions to be granted on the engines, catalogs, buckets, and databases. Roles for these components include Admin, Manager, User, Writer, and Reader (depending on the component). Access to the data itself is managed through data control policies. Policies can be created to permit or deny access to schemas, tables, and columns. User account management and access management vary between the different deployment options for watsonx.data. For instance, in the managed cloud service (SaaS), the service owner would need to invite other users to the environment and give them appropriate service access. With the standalone software, users can be added within the console\u2019s Access control page. In the Developer Edition, users can be added using a command line tool. Credits: Portions of text found in this section were copied from Kelly Schlamb's watsonx.data L3 course.","title":"Watsonx.data User Administration and Roles"},{"location":"wxd-useradmin/#user-administration","text":"This lab is using the Developer edition of the watsonx.data software, which does not support adding users through the console's Access control page. In order to manage users, the user-mgmt command will need to be used. The user-mgmt command is found in the /root/ibm-lh-dev/bin directory. Examples of using the command are found below.","title":"User Administration"},{"location":"wxd-useradmin/#add-a-user","text":"The syntax for adding a user is: ./user-mgmt add-user <username> [ User | Admin ] <password> The values are: username - The name of the user [User|Admin] - The type of user. Note that the type of user is case-sensitive! password - The password for the user. The following command will add the user watsonx with a password of watsonx.data . This will be a standard user with no privileges. The first step is to make sure you are connected to the watsonx.data server as the root user and have switched to the proper directory. sudo su - cd /root/ibm-lh-dev/bin The next command will add a new user to the system. ./user-mgmt add-user watsonx User watsonx.data","title":"Add a User"},{"location":"wxd-useradmin/#change-a-users-password","text":"The syntax for changing a password is: ./user-mgmt change-password <username> This command will prompt for the new password. You can issue the command and provide the new password at the prompt. The other way to supply the password at the prompt is to use the Linux yes command, which repeats a value multiple times. The following command will change the password of watsonx to hellowatson . yes hellowatson | ./user-mgmt change-password watsonx","title":"Change a User's Password"},{"location":"wxd-useradmin/#validate-a-users-password","text":"You can validate a password by using the following command: ./user-mgmt test-user-cred <username> The username is the name of the user that you want to check the password for. This command will prompt for the existing password in order to check it. You can use the yes command (as described above) to supply the password non-interactively.
The following command will check that we have changed the password of watsonx to hellowatson . yes hellowatson | ./user-mgmt test-user-cred watsonx","title":"Validate a User's Password"},{"location":"wxd-useradmin/#delete-a-user","text":"To delete a user, use the following command: ./user-mgmt delete-user <username> Any error messages about group ownership can be safely ignored. The following command will remove our watsonx user. ./user-mgmt delete-user watsonx","title":"Delete a User"},{"location":"wxd-useradmin/#roles-and-policies","text":"In this section you will add a new user and provide them with privileges over the infrastructure and data. Start by adding a new user to the watsonx.data system. If you haven't already, make sure you are connected to the server as the root user and are in the /root/ibm-lh-dev/bin directory. Add user1 to the system with a password of password . ./user-mgmt add-user user1 User password","title":"Roles and Policies"},{"location":"wxd-useradmin/#access-control","text":"To view which users are currently authorized to use the system, select the Access control icon found on the left side of the watsonx.data UI. A list of the objects that make up the watsonx.data infrastructure is displayed. You can see that the objects are made up of: Engines Catalogs Buckets In a real-world scenario where a user will be querying data from a table, that user will need to be given a minimum of User access to an engine (to be able to run the query), User access for the catalog associated with the data (to be able to see the schema information associated with the table), and Reader access to the bucket associated with the data (to be able to read the data from object storage). Additionally, a policy has to be created to permit the user to access the table in question.","title":"Access Control"},{"location":"wxd-useradmin/#granting-access","text":"Select the presto-01 engine (highlighted in red above) to view the current users that have access to the engine. At this point, only the administrative user (ibmlhadmin) can use the Presto engine. Click on the Add Access button to add a new authorized user to the list. The role button has been selected in the dialog to show the role options of Admin, Manager, or User. An Admin user can grant any role to a user, while a Manager can only grant User privileges. For this lab, grant user1 \"user\" privileges and then press Add. Close the dialog by clicking on the [x] on the top right of the screen. Now user1 needs to be granted access to a catalog. In this case, the iceberg_data and hive_data catalogs are required for the Presto engine and are implicitly granted to the user. Click on the iceberg_data line to confirm that this is the case. You should see that user1 has already been granted access to the catalog. If your version of watsonx.data Developer Edition does not have access granted to user1 , add it manually using the same steps as for engine access. Close the dialog to return to the Access control screen. The next step is to grant access to the underlying buckets. Because user1 was implicitly granted access to the iceberg_data and hive_data catalogs, the underlying buckets iceberg-bucket and hive-bucket were also added to the user's access list. Click on iceberg-bucket to view the access control. Again, if user1 does not have access to the bucket, add access using the same steps as adding engine access.
When done viewing the screen, close the dialog to return to the Access control screen.","title":"Granting Access"},{"location":"wxd-useradmin/#policies","text":"After access has been granted to engines, catalogs, and buckets, the final step is to create a policy to grant access to tables. Click on the Policy tab to display the current policies in place (there should be none). The Add Policy button is highlighted on the far right side of the screen. Pressing the button will display the new Access Control Policy dialog. Fill in the following information: Policy name: selectflights Description: blank Policy status: active You can always activate a policy after you have created it. Click Next. Here we need to select which schemas the user will be able to access. For this example, select the ontime schema. After selecting the schema, a set of tables associated with the schema will be listed. You can choose which tables can be searched. If you choose an individual table, you can restrict which columns can be searched. Select the ontime table and then select the following columns (you will need to scroll down the page): flightdate reporting_airline flight_number_reporting_airline originairportid destairportid Once you have selected the columns, press the Next button to display the Rules dialog. The Rules dialog allows you to fine-tune what actions can be done by a user against the data. Press the Add Rule button to display the Add Rule dialog. Rules can be used to Allow actions or to Deny actions. In our example, we want to allow user1 to SELECT data from the table, but with no other options. Note : In production versions of watsonx.data, you can provide access to a group, which makes it simpler to create a set of rules that apply to the group and then add users to that group. That way a user inherits the rules that were applied to the group, rather than rules having to be created for each individual. The developer edition displays GROUP as an option, but it is not implemented. Once the fields are filled in, press the Add button. You can continue to add additional rules to the list. Since we only want the one rule, select the Review button. Once you have confirmed that the data objects and rules look correct, press the Save button. The selectflights policy is now in place and is actively enforced. Before testing the policy enforcement, use the SQL icon on the left side to navigate to the hive_data catalog and view the schemas that are associated with it. Expand the ontime schema to view the tables and the columns that are available in the ontime table. When you connect as user1 , you will be able to compare which objects can be accessed from that userid.","title":"Policies"},{"location":"wxd-useradmin/#testing-policy-enforcement","text":"To test whether the rules are enforced, you will need to log out of the current watsonx.data UI session. At the top of the Access Control screen, you will see the user icon on the top right. Clicking on the icon will display the logout dialog. Log out to the main watsonx.data UI screen. You will be prompted to confirm the logout. Once back at the main login panel, enter user1 and password into the dialog. Your watsonx.data UI should now display user1 . Navigate to the SQL icon and then select hive_data -> ontime -> ontime . You should notice that user1 was restricted to seeing only the ontime schema in the hive_data catalog. In addition, the user was restricted to accessing one of the tables ( ontime ) and 5 columns.
Attempting to run a SELECT statement against all the data will result in a policy error. Correcting the SQL to include only permitted columns results in an answer set. The policy rules have been enforced for user1 , preventing them from viewing any other schemas or tables in the system. In addition, the SQL that they could execute was restricted to specific columns in the table. Before moving on to any other sections, make sure to log out as user1 and reconnect as the ibmlhadmin user.","title":"Testing Policy Enforcement"},{"location":"wxd-vmware/","text":"IBM watsonx.data VMware Image The IBM watsonx.data lab can be run in a virtual machine environment using VMware Workstation, VMware Fusion, or Oracle VirtualBox. The location of the OVA file (a compressed OS image format) is provided in the TechZone page for the lab: https://techzone.ibm.com/collection/ibm-watsonxdata-developer-base-image Select the resources tab to get details on how to download the file. Download the watsonxdata.ova file onto your local machine and then use the import function of VMware or VirtualBox to register it with the system. Note : This virtual machine was created using X64 (Intel) hardware, so this will not work in an OSX environment using M1/M2 chips. Once the machine is imported you can delete the OVA file. Before starting the machine, you may want to adjust the hardware requirements. vCPUs \u2013 4 minimum Memory \u2013 16Gb minimum (you can try 12Gb, but it will be tight) Disk \u2013 30Gb initial size, but the image will grow in size Disable side channel mitigation \u2013 ON (VMware only) VMware URLs All the URLs in the lab use 192.168.252.2 as the host. When running in the VMware image, you must use localhost for the addresses. You must substitute localhost for the 192.168.252.2 address when you come across it in the documentation. The following URLs and ports are used to access the watsonx.data services in the lab. https://localhost:9443 - watsonx.data management console http://localhost:8080 - Presto console http://localhost:9001 - MinIO console (S3 buckets) https://localhost:6443 - Portainer (Docker container management) http://localhost:8088 - Apache Superset (Query and Graphing) 8443 - Presto External Port 5432 - Postgres External Port 50000 - Db2 Database Port The Apache Superset link will not be active until started as part of the lab. These links have been placed into the Firefox browser for your convenience. Starting the VMware Image When the machine starts, you will be prompted with the logon screen. There are two userids that we will be using in the VMware image: root \u2013 password watsonx.data watsonx \u2013 password watsonx.data When successfully logged in you should see the following screen. Next, check that your network connection is up and running. You will be able to see if the network is connected when the network icon appears on the top row. If it shows Wired Off, make sure to turn it on by clicking on the arrow and choosing \"Connect\". If you are using something other than an English keyboard, click on the en1 symbol on the top bar to switch to a different layout. If your keyboard is not listed, you will need to go into Settings and add your keyboard layout. You may also want to consider making the screen size larger. Use the drop-down menu at the top of the screen to select System Tools -> Settings. In the Devices section of the Settings menu, select Displays and choose a resolution that is suitable for your environment.
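Once the image is up, a quick way to confirm that the main consoles are listening is to probe the ports from a terminal inside the virtual machine. This is an optional sketch, not a lab step: the -k flag tells curl to skip certificate validation because the console uses a self-signed certificate, and a status code such as 200 or 302 means the service is responding.
# print only the HTTP status code for each console
curl -k -s -o /dev/null -w "%{http_code}\n" https://localhost:9443   # watsonx.data console
curl -s -o /dev/null -w "%{http_code}\n" http://localhost:8080       # Presto console
curl -s -o /dev/null -w "%{http_code}\n" http://localhost:9001       # MinIO console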
Using External Ports with VMware/VirtualBox The labs assume that you are using a browser \"within\" your virtual machine console. However, both VMware and VirtualBox provide a method for accessing the ports on the virtual machine in your local environment. VMware For VMware, the easiest way to connect to the virtual machine from your host machine is to use the ifconfig command to determine your virtual machine IP address. ifconfig Search for an entry starting with ens in the output of the command. There you should see the inet address of your virtual machine (in this example, 172.16.210.237 ). To access the Portainer application from your local browser, you would use this address followed by the Portainer PORT number: https://172.16.210.237:6443 . Remember that inside your virtual machine, you will be using https://localhost:6443 . The following PORT numbers are open in the machine: 9443 - IBM watsonx.data management console 8080 - Presto console 9001 - MinIO console (S3 buckets) 6443 - Portainer (Docker container management) 8088 - Apache Superset (Query and Graphing) 5901 - VNC Access (Access to GUI in the machine) 7681 - SSH (Terminal access) via Browser 22 - SSH (Terminal access) via local terminal program 8443 - Presto External Port (dBeaver connection) 5432 - Postgres External Port (dBeaver connection) VirtualBox VirtualBox does not externalize the IP address of the virtual machine. The ifconfig command will provide an IP address of the machine, but it will not be reachable from your host browser. To open the ports, you must use the network option on the virtual machine. This step can be done while the machine is running. From the VirtualBox console, choose Settings for the machine and then click on the Network option. Press the Advanced option near the bottom of the dialog. Select the Port Forwarding button. This will display the port forwarding menu. You must place an entry for each port that you want to externalize to the host machine. If the value for Host IP is empty (blank), it defaults to localhost. In the example above, the 5901 port in the Guest machine (watsonxdata) is mapped to the host machine's 5901 port. To access VNC, you would use localhost:5901 . If the guest machine port conflicts with the host machine port number, you can use a different port number. A scripted alternative using the VBoxManage command is sketched below. Terminal Command Window All the commands in the lab will require you to execute commands in a terminal window. In addition, the labs require access to the root userid, and this can be accomplished in two ways that are described below. Local Terminal Shell Use a local terminal shell (iTerm, Hyper, Terminal) and use the SSH command to shell into the machine. For the VMware image, you need to know the IP address of the image and the port number that has been exposed for the SSH command (the default is 22). Assuming that your VMware machine has an IP address of 172.16.210.237 , the command to SSH into the machine would be: ssh watsonx@172.16.210.237 You will need to accept the unknown host warning and then provide the password for the watsonx userid: watsonx.data . At this point you are connected as the watsonx user. To become the root user, you must enter the following command in the terminal window. sudo su - Now as the root user you will be ready to run the commands found in the lab. Terminal Window in Virtual Machine You can use the Terminal application in the virtual machine to issue commands. This will open up the terminal window. At this point you are connected as the watsonx user.
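As referenced in the VirtualBox section above, the port-forwarding rules can also be scripted from the host instead of being entered in the Settings dialog. This sketch is not part of the original lab and makes two assumptions: the virtual machine is named watsonxdata in your VirtualBox console, and its network adapter 1 is in NAT mode. VBoxManage ships with VirtualBox; controlvm natpf1 adds a rule while the machine is running, and modifyvm --natpf1 does the same while it is powered off.
# forward the watsonx.data console and VNC ports to the host ("watsonxdata" is an assumed VM name)
VBoxManage controlvm "watsonxdata" natpf1 "wxd-console,tcp,,9443,,9443"
VBoxManage controlvm "watsonxdata" natpf1 "vnc,tcp,,5901,,5901"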
You can ignore any lab instructions that ask you to ssh into the watsonx server. To become the root user, you must enter the following command in the terminal window. sudo su - Now as the root user you will be ready to run the commands found in the lab.","title":"VMWare Image"},{"location":"wxd-vmware/#ibm-watsonxdata-vmware-image","text":"The IBM watsonx.data lab can be run in a virtual machine environment using VMware Workstation, VMware Fusion, or Oracle VirtualBox. The location of the OVA file (a compressed OS image format) is provided in the TechZone page for the lab: https://techzone.ibm.com/collection/ibm-watsonxdata-developer-base-image Select the resources tab to get details on how to download the file. Download the watsonxdata.ova file onto your local machine and then use the import function of VMware or VirtualBox to register it with the system. Note : This virtual machine was created using X64 (Intel) hardware, so this will not work in an OSX environment using M1/M2 chips. Once the machine is imported you can delete the OVA file. Before starting the machine, you may want to adjust the hardware requirements. vCPUs \u2013 4 minimum Memory \u2013 16Gb minimum (you can try 12Gb, but it will be tight) Disk \u2013 30Gb initial size, but the image will grow in size Disable side channel mitigation \u2013 ON (VMware only)","title":"IBM watsonx.data VMware Image"},{"location":"wxd-vmware/#vmware-urls","text":"All the URLs in the lab use 192.168.252.2 as the host. When running in the VMware image, you must use localhost for the addresses. You must substitute localhost for the 192.168.252.2 address when you come across it in the documentation. The following URLs and ports are used to access the watsonx.data services in the lab. https://localhost:9443 - watsonx.data management console http://localhost:8080 - Presto console http://localhost:9001 - MinIO console (S3 buckets) https://localhost:6443 - Portainer (Docker container management) http://localhost:8088 - Apache Superset (Query and Graphing) 8443 - Presto External Port 5432 - Postgres External Port 50000 - Db2 Database Port The Apache Superset link will not be active until started as part of the lab. These links have been placed into the Firefox browser for your convenience.","title":"VMware URLs"},{"location":"wxd-vmware/#starting-the-vmware-image","text":"When the machine starts, you will be prompted with the logon screen. There are two userids that we will be using in the VMware image: root \u2013 password watsonx.data watsonx \u2013 password watsonx.data When successfully logged in you should see the following screen. Next, check that your network connection is up and running. You will be able to see if the network is connected when the network icon appears on the top row. If it shows Wired Off, make sure to turn it on by clicking on the arrow and choosing \"Connect\". If you are using something other than an English keyboard, click on the en1 symbol on the top bar to switch to a different layout. If your keyboard is not listed, you will need to go into Settings and add your keyboard layout. You may also want to consider making the screen size larger. Use the drop-down menu at the top of the screen to select System Tools -> Settings.
In the Devices section of the Settings menu, select Displays and choose a resolution that is suitable for your environment.","title":"Starting the VMware Image"},{"location":"wxd-vmware/#using-external-ports-with-vmwarevirtual-box","text":"The labs assume that you are using a browser \"within\" your virtual machine console. However, both VMware and VirtualBox provide a method for accessing the ports on the virtual machine in your local environment.","title":"Using External Ports with VMware/VirtualBox"},{"location":"wxd-vmware/#vmware","text":"For VMware, the easiest way to connect to the virtual machine from your host machine is to use the ifconfig command to determine your virtual machine IP address. ifconfig Search for an entry starting with ens in the output of the command. There you should see the inet address of your virtual machine (in this example, 172.16.210.237 ). To access the Portainer application from your local browser, you would use this address followed by the Portainer PORT number: https://172.16.210.237:6443 . Remember that inside your virtual machine, you will be using https://localhost:6443 . The following PORT numbers are open in the machine: 9443 - IBM watsonx.data management console 8080 - Presto console 9001 - MinIO console (S3 buckets) 6443 - Portainer (Docker container management) 8088 - Apache Superset (Query and Graphing) 5901 - VNC Access (Access to GUI in the machine) 7681 - SSH (Terminal access) via Browser 22 - SSH (Terminal access) via local terminal program 8443 - Presto External Port (dBeaver connection) 5432 - Postgres External Port (dBeaver connection)","title":"VMware"},{"location":"wxd-vmware/#virtualbox","text":"VirtualBox does not externalize the IP address of the virtual machine. The ifconfig command will provide an IP address of the machine, but it will not be reachable from your host browser. To open the ports, you must use the network option on the virtual machine. This step can be done while the machine is running. From the VirtualBox console, choose Settings for the machine and then click on the Network option. Press the Advanced option near the bottom of the dialog. Select the Port Forwarding button. This will display the port forwarding menu. You must place an entry for each port that you want to externalize to the host machine. If the value for Host IP is empty (blank), it defaults to localhost. In the example above, the 5901 port in the Guest machine (watsonxdata) is mapped to the host machine's 5901 port. To access VNC, you would use localhost:5901 . If the guest machine port conflicts with the host machine port number, you can use a different port number.","title":"VirtualBox"},{"location":"wxd-vmware/#terminal-command-window","text":"All the commands in the lab will require you to execute commands in a terminal window. In addition, the labs require access to the root userid, and this can be accomplished in two ways that are described below.","title":"Terminal Command Window"},{"location":"wxd-vmware/#local-terminal-shell","text":"Use a local terminal shell (iTerm, Hyper, Terminal) and use the SSH command to shell into the machine. For the VMware image, you need to know the IP address of the image and the port number that has been exposed for the SSH command (the default is 22). Assuming that your VMware machine has an IP address of 172.16.210.237 , the command to SSH into the machine would be: ssh watsonx@172.16.210.237 You will need to accept the unknown host warning and then provide the password for the watsonx userid: watsonx.data .
At this point you are connected as the watsonx user. To become the root user, you must enter the following command in the terminal window. sudo su - Now as the root user you will be ready to run the commands found in the lab.","title":"Local Terminal Shell"},{"location":"wxd-vmware/#terminal-window-in-virtual-machine","text":"You can use the Terminal application in the virtual machine to issue commands. This will open up the terminal window. At this point you are connected as the watsonx user. You can ignore any lab instructions that ask you to ssh into the watsonx server. To become the root user, you must enter the following command in the terminal window. sudo su - Now as the root user you will be ready to run the commands found in the lab.","title":"Terminal Window in Virtual Machine"},{"location":"wxd-watsonui/","text":"Using the watsonx.data console UI Your TechZone reservation will include the server name and port number to use when connecting to the watsonx.data UI. The default port number is 9443, while the server will be referred to as region.techzone-server.com . Replace these values with those found in your reservation. Open your browser and navigate to: Watsonx.data UI - https://region.techzone-server.com:port Credentials: username: ibmlhadmin password: password Note: You will get a Certificate error in Firefox: Select Advanced. Choose \"Accept the Risk and Continue\". If you are using Google Chrome, you can bypass the error message by typing in \"thisisunsafe\" or clicking on the \"Proceed to server name (unsafe)\" link. The server name value will be replaced with the name of the TechZone server you are connecting to. The watsonx.data UI will display. The userid is ibmlhadmin with a password of password . Note : If you see the following screen when first connecting to the UI, this is an indication that the service has not completely initialized. Dismiss all the error messages and then click on the Person icon (far right side above the messages) and Logout. Close the browser window after logging out and open the web page again until you get the proper login screen. At this point you will be connected to the console. Watsonx.data UI Navigation The main screen provides a snapshot of the objects that are currently found in the watsonx.data system. The infrastructure components section shows 1 engine, 2 catalogs, and 2 buckets associated with the system. You can examine these objects by using the menu system found at the left side of the screen. Click on the hamburger icon. This will provide a list of items that you can explore in the UI. You can also access this list by clicking on one of the following icons. You can explore the various menus to see how the UI works. A brief description of the items is found below. Infrastructure manager - Displays the current engines, buckets and databases associated with the installation. Data Manager - Used to explore the various data sources that are catalogued in the system. You can explore the schemas, tables, table layout and view a subset of the data with this option. The display may take a few minutes to show the schemas in the system as it is querying the catalog and populating the descriptions on the screen. Query Workplace - A SQL-based query tool for accessing the data. Query History - A list of SQL queries that were previously run across all engines. Access Control - Control who can access the data.
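As a first query, the statement below can be pasted into the Query Workplace. It is a hedged sketch that assumes the pre-defined TPCH data mentioned below is registered as a tpch catalog with a tiny schema, as in the developer edition; adjust the names to match what the Data Manager shows on your system.
-- list a few rows from the sample TPCH customer table
SELECT custkey, name, nationkey
FROM tpch.tiny.customer
LIMIT 10;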
Try using the Data Explorer and Query engine to access some of the data in the pre-defined TPCH schema.","title":"watsonx.data UI"},{"location":"wxd-watsonui/#using-the-watsonxdata-console-ui","text":"Your TechZone reservation will include the server name and port number to use when connecting to the watsonx.data UI. The default port number is 9443, while the server will be referred to as region.techzone-server.com . Replace these values with those found in your reservation. Open your browser and navigate to: Watsonx.data UI - https://region.techzone-server.com:port Credentials: username: ibmlhadmin password: password Note: You will get a Certificate error in Firefox: Select Advanced. Choose \"Accept the Risk and Continue\". If you are using Google Chrome, you can bypass the error message by typing in \"thisisunsafe\" or clicking on the \"Proceed to server name (unsafe)\" link. The server name value will be replaced with the name of the TechZone server you are connecting to. The watsonx.data UI will display. The userid is ibmlhadmin with a password of password . Note : If you see the following screen when first connecting to the UI, this is an indication that the service has not completely initialized. Dismiss all the error messages and then click on the Person icon (far right side above the messages) and Logout. Close the browser window after logging out and open the web page again until you get the proper login screen. At this point you will be connected to the console.","title":"Using the watsonx.data console UI"},{"location":"wxd-watsonui/#watsonxdata-ui-navigation","text":"The main screen provides a snapshot of the objects that are currently found in the watsonx.data system. The infrastructure components section shows 1 engine, 2 catalogs, and 2 buckets associated with the system. You can examine these objects by using the menu system found at the left side of the screen. Click on the hamburger icon. This will provide a list of items that you can explore in the UI. You can also access this list by clicking on one of the following icons. You can explore the various menus to see how the UI works. A brief description of the items is found below. Infrastructure manager - Displays the current engines, buckets and databases associated with the installation. Data Manager - Used to explore the various data sources that are catalogued in the system. You can explore the schemas, tables, table layout and view a subset of the data with this option. The display may take a few minutes to show the schemas in the system as it is querying the catalog and populating the descriptions on the screen. Query Workplace - A SQL-based query tool for accessing the data. Query History - A list of SQL queries that were previously run across all engines. Access Control - Control who can access the data.
Try using the Data Explorer and Query engine to access some of the data in the pre-defined TPCH schema.","title":"Watsonx.data UI Navigation"}]} \ No newline at end of file diff --git a/sitemap.xml b/sitemap.xml index 2861bfc..8c37180 100644 --- a/sitemap.xml +++ b/sitemap.xml @@ -2,227 +2,227 @@ None - 2024-03-08 + 2024-03-11 daily None - 2024-03-08 + 2024-03-11 daily None - 2024-03-08 + 2024-03-11 daily None - 2024-03-08 + 2024-03-11 daily None - 2024-03-08 + 2024-03-11 daily None - 2024-03-08 + 2024-03-11 daily None - 2024-03-08 + 2024-03-11 daily None - 2024-03-08 + 2024-03-11 daily None - 2024-03-08 + 2024-03-11 daily None - 2024-03-08 + 2024-03-11 daily None - 2024-03-08 + 2024-03-11 daily None - 2024-03-08 + 2024-03-11 daily None - 2024-03-08 + 2024-03-11 daily None - 2024-03-08 + 2024-03-11 daily None - 2024-03-08 + 2024-03-11 daily None - 2024-03-08 + 2024-03-11 daily None - 2024-03-08 + 2024-03-11 daily None - 2024-03-08 + 2024-03-11 daily None - 2024-03-08 + 2024-03-11 daily None - 2024-03-08 + 2024-03-11 daily None - 2024-03-08 + 2024-03-11 daily None - 2024-03-08 + 2024-03-11 daily None - 2024-03-08 + 2024-03-11 daily None - 2024-03-08 + 2024-03-11 daily None - 2024-03-08 + 2024-03-11 daily None - 2024-03-08 + 2024-03-11 daily None - 2024-03-08 + 2024-03-11 daily None - 2024-03-08 + 2024-03-11 daily None - 2024-03-08 + 2024-03-11 daily None - 2024-03-08 + 2024-03-11 daily None - 2024-03-08 + 2024-03-11 daily None - 2024-03-08 + 2024-03-11 daily None - 2024-03-08 + 2024-03-11 daily None - 2024-03-08 + 2024-03-11 daily None - 2024-03-08 + 2024-03-11 daily None - 2024-03-08 + 2024-03-11 daily None - 2024-03-08 + 2024-03-11 daily None - 2024-03-08 + 2024-03-11 daily None - 2024-03-08 + 2024-03-11 daily None - 2024-03-08 + 2024-03-11 daily None - 2024-03-08 + 2024-03-11 daily None - 2024-03-08 + 2024-03-11 daily None - 2024-03-08 + 2024-03-11 daily None - 2024-03-08 + 2024-03-11 daily None - 2024-03-08 + 2024-03-11 daily \ No newline at end of file diff --git a/sitemap.xml.gz b/sitemap.xml.gz index 9851801765dd247cfc6272b5d79e1c10b5e22460..44f84a8d394645e69cb6e4fc4d76971073097454 100644 GIT binary patch delta 204 zcmcc5c%PA7zMF$XyZHS?cG>y{zAUesGIPH$a)c-}idCxPwm3r?r zO}0G>x$?O6htwwx8>5#$AN$Db+_qU{+PX(#EzA6EZ=Liu3LYG2WM)TY{6G4}T0M_1K Ax&QzG delta 204 zcmcc5c%PA7zMF$X+w}EBcG-GS*$Y!PSz5hh7FglHq+7K5#qHDaJp2zT-d(zVd2*dm z?e^aPmu##Pgyc$_`9Ue3K1 znDKj3a9pqWqwEu|6)C4aAD?L#v8^&Cxwv$ro3~x=?e?CSY;Muc7Xlr%3Xe8!vPqjA z6_>i~%J(??kcn5%x_|QB)bYoiiJeczqTs=SMrIVohyRa0JKkEUuQw;GIZcd#0RYk` BUCaOg diff --git a/wxd-certificate/index.html b/wxd-certificate/index.html index db73abd..fd8181c 100644 --- a/wxd-certificate/index.html +++ b/wxd-certificate/index.html @@ -1204,8 +1204,9 @@

Step 5: Generate Java Keystore File< DNSName: ibm-lh-prestissimo-svc DNSName: ibm-lh-qhmm DNSName: ibm-lh-qhmm-svc - DNSName: *.services.cloud.techzone.ibm.com DNSName: watsonxdata + DNSName: watsonxdata.gym.lan + DNSName: *.services.cloud.techzone.ibm.com ] Trust this certificate? [no]: yes
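The hunk above shows the regenerated certificate adding watsonxdata.gym.lan and the TechZone wildcard as subject alternative names before the keystore import is confirmed. To double-check which SAN entries a running endpoint actually serves, a hedged one-liner with standard OpenSSL tooling can be used (localhost and the Presto external port 8443 are example values; substitute your own server and port):
# dump the served certificate and show its subject alternative names
echo | openssl s_client -connect localhost:8443 2>/dev/null | openssl x509 -noout -text | grep -A1 "Subject Alternative Name"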