diff --git a/docusaurus.config.js b/docusaurus.config.js index 12afef6bfa..39e2819049 100644 --- a/docusaurus.config.js +++ b/docusaurus.config.js @@ -36,13 +36,14 @@ const config = { attributes: { 'http-equiv': 'Content-Security-Policy', content: - "default-src 'self' 'unsafe-inline' 'unsafe-eval' data: https://avatars.githubusercontent.com https://github.com https://kit.fontawesome.com/ https://ka-f.fontawesome.com/ https://fonts.googleapis.com/ https://fonts.gstatic.com/ https://www.google-analytics.com/ https://www.googletagmanager.com/ https://*.algolia.net/;", + "default-src 'self' 'unsafe-inline' 'unsafe-eval' data: https://avatars.githubusercontent.com https://github.com https://kit.fontawesome.com/ https://ka-f.fontawesome.com/ https://fonts.googleapis.com/ https://fonts.gstatic.com/ https://www.youtube.com/ https://www.google-analytics.com/ https://www.googletagmanager.com/ https://*.algolia.net/;", }, }, { // Load font awesome icons tagName: 'script', attributes: { + defer: 'true', src: 'https://kit.fontawesome.com/17a35e44e3.js', crossorigin: 'anonymous', }, @@ -88,6 +89,15 @@ const config = { path: './releases', }, ], + [ + '@docusaurus/plugin-content-docs', + { + id: 'quickstarts', + path: 'quickstarts', + routeBasePath: 'quickstarts', + sidebarPath: './sidebars.js', + }, + ], ], presets: [ @@ -160,7 +170,7 @@ const config = { title: 'Developers', navItems: [ { - href: 'https://quickstarts.teradata.com/', + href: `${baseUrl}/quickstarts/business-intelligence/create-stunning-visualizations-in-power-bi-using-data-from-teradata-vantage/`, label: 'Getting started', }, { diff --git a/quickstarts/_partials/_clone-repo.mdx b/quickstarts/_partials/_clone-repo.mdx new file mode 100644 index 0000000000..dd65ce167e --- /dev/null +++ b/quickstarts/_partials/_clone-repo.mdx @@ -0,0 +1,11 @@ + + +The `deployments` folder in the [AI Unlimited GitHub repository](https://github.com/Teradata/ai-unlimited) that Teradata provides contains template, parameter, and policy files for installing AI Unlimited. + + Open a terminal window, and clone the repository. + + ``` bash + git clone https://github.com/Teradata/ai-unlimited + ``` + + diff --git a/quickstarts/_partials/community_link.mdx b/quickstarts/_partials/community_link.mdx new file mode 100644 index 0000000000..4f7f0c0506 --- /dev/null +++ b/quickstarts/_partials/community_link.mdx @@ -0,0 +1,3 @@ +:::note +If you have any questions or need further assistance, please visit our [community forum](https://support.teradata.com/community) where you can get support and interact with other community members. +::: \ No newline at end of file diff --git a/quickstarts/_partials/getting-started-intro.mdx b/quickstarts/_partials/getting-started-intro.mdx new file mode 100644 index 0000000000..debfc07839 --- /dev/null +++ b/quickstarts/_partials/getting-started-intro.mdx @@ -0,0 +1,9 @@ +import UseCase from './use-csae.mdx'; + + +## Overview +This how-to shows how to gain access to a Teradata database by running it on your local machine. Once you finish the steps you will have a working Teradata Vantage Express database on your computer. 
+ +:::note +Starting with version 17.20, Vantage Express includes the following analytics packages: [Vantage Analytics Library](https://docs.teradata.com/r/Vantage-Analytics-Library-User-Guide/January-2022), [Bring Your Own Model (BYOM)](https://docs.teradata.com/r/Teradata-VantageTM-Bring-Your-Own-Model-User-Guide/May-2022), [API Integration with AWS SageMaker](https://docs.teradata.com/r/Teradata-VantageTM-API-Integration-Guide-for-Cloud-Machine-Learning/April-2022). +::: \ No newline at end of file diff --git a/quickstarts/_partials/getting-started-queries.mdx b/quickstarts/_partials/getting-started-queries.mdx new file mode 100644 index 0000000000..2372787db9 --- /dev/null +++ b/quickstarts/_partials/getting-started-queries.mdx @@ -0,0 +1,54 @@ +```sql +CREATE DATABASE HR +AS PERMANENT = 60e6, -- 60MB + SPOOL = 120e6; -- 120MB +``` + +5. Let’s create a sample table and insert some data and query it. We will first create a table to hold employee information: + +```sql +CREATE SET TABLE HR.Employees ( + GlobalID INTEGER, + FirstName VARCHAR(30), + LastName VARCHAR(30), + DateOfBirth DATE FORMAT 'YYYY-MM-DD', + JoinedDate DATE FORMAT 'YYYY-MM-DD', + DepartmentCode BYTEINT +) +UNIQUE PRIMARY INDEX ( GlobalID ); +``` + +6. Now, let's insert a record: + +```sql +INSERT INTO HR.Employees ( + GlobalID, + FirstName, + LastName, + DateOfBirth, + JoinedDate, + DepartmentCode +) +VALUES ( + 101, + 'Adam', + 'Tworkowski', + '1980-01-05', + '2004-08-01', + 01 +); +``` + +7. Finally, let's see if we can retrieve the data: + +```sql +SELECT * FROM HR.Employees; +``` + +You should get the following results: + +```sql +GlobalID FirstName LastName DateOfBirth JoinedDate DepartmentCode +-------- --------- ---------- ----------- ---------- -------------- + 101 Adam Tworkowski 1980-01-05 2004-08-01 1 +``` \ No newline at end of file diff --git a/quickstarts/_partials/getting-started-summary.mdx b/quickstarts/_partials/getting-started-summary.mdx new file mode 100644 index 0000000000..a95361e108 --- /dev/null +++ b/quickstarts/_partials/getting-started-summary.mdx @@ -0,0 +1,3 @@ +## Summary + +In this guide we have covered how to quickly create a working Teradata environment. We used Teradata Vantage Express in a VM running on VMware. In the same VM, we ran Teradata Studio Express to issue queries. We installed all software locally and didn't have to pay for cloud resources. \ No newline at end of file diff --git a/quickstarts/_partials/install-ve-in-public.mdx b/quickstarts/_partials/install-ve-in-public.mdx new file mode 100644 index 0000000000..55c0b75805 --- /dev/null +++ b/quickstarts/_partials/install-ve-in-public.mdx @@ -0,0 +1,112 @@ +import GettingStartedQueries from './getting-started-queries.mdx'; + +run. Install VirtualBox and 7zip: +```bash +apt update && apt-get install p7zip-full p7zip-rar virtualbox -y +``` +5. Retrieve the curl command to download Vantage Express. +* Go to [Vantage Expess download page](https://downloads.teradata.com/download/database/teradata-express-for-vmware-player) (registration required). +* Click on the latest download link, e.g. "Vantage Express 17.20". You will see a license agreement popup. Don't accept the license yet. +* Open the network view in your browser. For example, in Chrome press kbd:[F12] and navigate to `Network` tab: + +![Browser Network Tab](../images/browser.network.png) + +* Accept the license by clicking on `I Agree` button and cancel the download. +* In the network view, find the last request that starts with `VantageExpress`. 
Right click on it and select `Copy -> Copy as cURL`: + +![Browser Copy culr](../images/browser.copy.curl.png) +* Head back to the ssh session and download Vantage Express by pasting the curl command. Add `-o ve.7z` to the command to save the download to file named `ve.7z`. You can remove all the HTTP headers, e.g.: + +```bash +curl -o ve.7z 'http://d289lrf5tw1zls.cloudfront.net/database/teradata-express/VantageExpress17.20_Sles12_202108300444.7z?Expires=1638719978&Signature=GKBkNvery_long_signature__&Key-Pair-Id=********************' +``` + +* Unzip the downloaded file. It will take several minutes: + +```bash +7z x ve.7z +``` +* Start the VM in VirtualBox. The command will return immediately but the VM init process will take several minutes: + +```bash +export VM_IMAGE_DIR="/opt/downloads/VantageExpress17.20_Sles12" +DEFAULT_VM_NAME="vantage-express" +VM_NAME="${VM_NAME:-$DEFAULT_VM_NAME}" +vboxmanage createvm --name "$VM_NAME" --register --ostype openSUSE_64 +vboxmanage modifyvm "$VM_NAME" --ioapic on --memory 6000 --vram 128 --nic1 nat --cpus 4 +vboxmanage storagectl "$VM_NAME" --name "SATA Controller" --add sata --controller IntelAhci +vboxmanage storageattach "$VM_NAME" --storagectl "SATA Controller" --port 0 --device 0 --type hdd --medium "$(find $VM_IMAGE_DIR -name '*disk1*')" +vboxmanage storageattach "$VM_NAME" --storagectl "SATA Controller" --port 1 --device 0 --type hdd --medium "$(find $VM_IMAGE_DIR -name '*disk2*')" +vboxmanage storageattach "$VM_NAME" --storagectl "SATA Controller" --port 2 --device 0 --type hdd --medium "$(find $VM_IMAGE_DIR -name '*disk3*')" +vboxmanage modifyvm "$VM_NAME" --natpf1 "tdssh,tcp,,4422,,22" +vboxmanage modifyvm "$VM_NAME" --natpf1 "tddb,tcp,,1025,,1025" +vboxmanage startvm "$VM_NAME" --type headless +vboxmanage controlvm "$VM_NAME" keyboardputscancode 1c 1c +``` +* ssh to Vantage Express VM. Use `root` as password: + +```bash +ssh -p 4422 root@localhost +``` +* Validate that the DB is up: +```bash +pdestate -a +``` + +If the command returns `PDE state is RUN/STARTED. DBS state is 5: Logons are enabled - The system is quiescent`, it means that Vantage Express has started. +If the status is different, repeat `pdestate -a` till you get the correct status. + +* Once Vantage Express is up and running, start `bteq` client command line client. BTEQ (pronounced “bee-teek”) is a general-purpose, command-based client tool used to submit SQL queries to a Teradata Database. + +```bash +bteq +``` +* Once in bteq, connect to your Vantage Express instance. When asked for the password, enter `dbc`: + +```bash +.logon localhost/dbc +``` + +## Run sample queries + +* Using `dbc` user, we will create a new database called `HR`. Copy/paste this query and run press kbd:[Enter]: + + + +## Optional setup + +* If you intend to stop and start the VM, you may want to add Vantage Express to autostart. 
ssh to your VM and run the following commands: + +```bash +sudo -i + +cat <> /etc/default/virtualbox +VBOXAUTOSTART_DB=/etc/vbox +VBOXAUTOSTART_CONFIG=/etc/vbox/autostart.cfg +EOF + +cat < /etc/systemd/system/vantage-express.service +[Unit] +Description=vm1 +After=network.target virtualbox.service +Before=runlevel2.target shutdown.target +[Service] +User=root +Group=root +Type=forking +Restart=no +TimeoutSec=5min +IgnoreSIGPIPE=no +KillMode=process +GuessMainPID=no +RemainAfterExit=yes +ExecStart=/usr/bin/VBoxManage startvm vantage-express --type headless +ExecStop=/usr/bin/VBoxManage controlvm vantage-express savestate +[Install] +WantedBy=multi-user.target +EOF + +systemctl daemon-reload +systemctl enable vantage-express +systemctl start vantage-express +``` diff --git a/quickstarts/_partials/jupyter_notebook_clearscape_analytics_note.mdx b/quickstarts/_partials/jupyter_notebook_clearscape_analytics_note.mdx new file mode 100644 index 0000000000..563c5bd0d9 --- /dev/null +++ b/quickstarts/_partials/jupyter_notebook_clearscape_analytics_note.mdx @@ -0,0 +1,3 @@ +:::note +This how-to shows you how to add Teradata Extensions to a Jupyter Notebooks environment. A hosted version of Jupyter Notebooks integrated with Teradata Extensions and analytics tools is available for functional testing for free at https://clearscape.teradata.com. +::: \ No newline at end of file diff --git a/quickstarts/_partials/modelops-basic.mdx b/quickstarts/_partials/modelops-basic.mdx new file mode 100644 index 0000000000..4e9c86873a --- /dev/null +++ b/quickstarts/_partials/modelops-basic.mdx @@ -0,0 +1,151 @@ +## Create a new Project or use an existing one + +Add a new Project + +* create project + +* Details + +* Name: Demo: your-name + +* Description: ModelOps Demo + +* Group: your-name + +* Path: https://github.com/Teradata/modelops-demo-models + +* Credentials: No Credentials + +* Branch: master + +Here you can test the git connection. If is green then save and continue. Skip the service connection settings for now. + +When creating a new project, ModelOps will ask you for a new connection. 
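If you want to double-check the repository details outside the UI before relying on the in-project git test above, a quick command from any terminal is enough. This is only a sketch for the public demo repository and the `master` branch listed above; the ModelOps service itself still needs its own network access to the repository.

```bash
# List the matching remote branch; a line ending in refs/heads/master
# confirms the repository URL and branch name are reachable and spelled correctly.
git ls-remote --heads https://github.com/Teradata/modelops-demo-models.git master
```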
+ +## Create a Personal Connection + +Personal connection + +* Name: Vantage personal your-name + +* Description: Vantage demo env + +* Host: tdprd.td.teradata.com (internal for teradata transcend only) + +* Database: your-db + +* VAL Database: TRNG_XSP (internal for teradata transcend only) + +* BYOM Database: TRNG_BYOM (internal for teradata transcend only) + +* Login Mech: TDNEGO + +* Username/Password + +## Validate permissions in SQL database for VAL and BYOM + +You can check the permissions with the new healthcheck panel in the connections panel + +![ModelOps Healtcheck screenshot](../modelops/images/ModelOps_Healthcheck.png) + +## Add dataset to identify Vantage tables for BYOM evaluation and scoring + +Let's create a new dataset template, then 1 dataset for training and 2 datasets for evaluation so we can monitor model quality metrics with 2 different datasets + +Add datasets + +* create dataset template + +* Catalog + +* Name: PIMA + +* Description: PIMA Diabetes + +* Feature Catalog: Vantage + +* Database: your-db + +* Table: aoa_feature_metadata + +Features +Query: +``` sql +SELECT * FROM {your-db}.pima_patient_features +``` +Entity Key: PatientId +Features: NumTimesPrg, PlGlcConc, BloodP, SkinThick, TwoHourSerIns, BMI, DiPedFunc, Age + +Entity & Target +Query: +``` sql +SELECT * FROM {your-db}.pima_patient_diagnoses +``` +Entity Key: PatientId +Target: HasDiabetes + +Predictions + +* Database: your-db + +* Table: pima_patient_predictions + +Entity selection: + +Query: +``` sql +SELECT * FROM pima_patient_features WHERE patientid MOD 5 = 0 +``` +Only for v6 (in v7 you will define this in the BYOM no code screen): BYOM Target Column: CAST(CAST(json_report AS JSON).JSONExtractValue('$.predicted_HasDiabetes') AS INT) + +## Create training dataset + +Basic + +* Name: Train + +* Description: Training dataset + +* Scope: Training + +* Entity & Target + +Query: +``` sql +SELECT * FROM {your-db}.pima_patient_diagnoses WHERE patientid MOD 5 = 1 +``` + +## Create evaluation dataset 1 + +Basic + +* Name: Evaluate + +* Description: Evaluation dataset + +* Scope: Evaluation + +* Entity & Target + +Query: +``` sql +SELECT * FROM {your-db}.pima_patient_diagnoses WHERE patientid MOD 5 = 2 +``` + + +## Create evaluation dataset 2 + +Basic + +* Name: Evaluate + +* Description: Evaluation dataset + +* Scope: Evaluation + +* Entity & Target + +Query: +``` sql +SELECT * FROM {your-db}.pima_patient_diagnoses WHERE patientid MOD 5 = 3 +``` \ No newline at end of file diff --git a/quickstarts/_partials/run-vantage.mdx b/quickstarts/_partials/run-vantage.mdx new file mode 100644 index 0000000000..d39394ecf0 --- /dev/null +++ b/quickstarts/_partials/run-vantage.mdx @@ -0,0 +1,80 @@ +- Press [ENTER] to select the highlighted `LINUX` boot partition. + +![Teradata Data Distribution](../images/run-vantage/boot-manager-menu.png) + +- On the next screen, press ENTER again to select the default SUSE Linux kernel. + +![Teradata Data Distribution](../images/run-vantage/grub-menu.png) + +- After completing the bootup sequence a terminal login prompt as shown in the screenshot below will appear. Don't enter anything in the terminal. Wait till the system starts the GUI. + +![Wait for GUI](../images/run-vantage/wait-for-gui.png) + +- After a while the following prompt will appear - assuming that you did not enter anything after the command login prompt above. Press `okay` button in the screen below. 
+ +![OK Security Popup](../images/run-vantage/okay-the-security-popup.png) + +- Once the VM is up, you will see its desktop environment. When prompted for username/password enter `root` for both. + +![VM Login](../images/run-vantage/vm.login.png) + +- The database is configured to autostart with the VM. To confirm that the database has started go to the virtual desktop and start `Gnome Terminal`. + +![Start Gnome Terminal](../images/run-vantage/start-gnome-terminal.png) + +- In the terminal execute `pdestate` command that will inform you if Vantage has already started: + +:::note +To paste into Gnome Terminal press SHIFT+CTRL+V. +::: + +```bash +watch pdestate -a +``` + +You want to wait till you see the following message: + +```bash +PDE state is RUN/STARTED. +DBS state is 5: Logons are enabled - The system is quiescent +``` + +
+
+See examples of messages that pdestate returns when the database is still initializing:
+
+```bash
+PDE state is DOWN/HARDSTOP.
+
+PDE state is START/NETCONFIG.
+
+PDE state is START/GDOSYNC.
+
+PDE state is START/TVSASTART.
+
+PDE state is START/READY.
+PDE state is RUN/STARTED.
+
+DBS state is 1/1: DBS Startup - Initializing DBS Vprocs
+PDE state is RUN/STARTED.
+
+DBS state is 1/5: DBS Startup - Voting for Transaction Recovery
+PDE state is RUN/STARTED.
+
+DBS state is 1/4: DBS Startup - Starting PE Partitions
+PDE state is RUN/STARTED.
+```
+
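If you prefer to script the wait rather than watch the output, a small shell loop along the following lines works on the Vantage Express VM (it assumes `pdestate` is on the PATH, which it is when you are logged in as `root` on the VM):

```bash
# Poll every 10 seconds until the DBS reports state 5 (logons enabled).
until pdestate -a 2>/dev/null | grep -q "Logons are enabled"; do
  echo "Vantage is still starting; waiting..."
  sleep 10
done
echo "Vantage Express is up - you can connect now."
```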
+ +- Now that the database is up, go back to the virtual desktop and launch `Teradata Studio Express` + +![Start Teradata Studio Express](../images/run-vantage/start-teradata-studio-express.png) + +- When you first start it you will be offered a tour. Once you close the tour, you will see a wizard window to add a new connection. Select `Teradata`: + +![New Connection Profile](../images/run-vantage/new.connection.profile.png) + +- On the next screen, connect to the database on your localhost using `dbc` for the username and password: + +![New Connection](../images/run-vantage/new.connection.png) diff --git a/quickstarts/_partials/running-sample-queries.mdx b/quickstarts/_partials/running-sample-queries.mdx new file mode 100644 index 0000000000..6471ea1690 --- /dev/null +++ b/quickstarts/_partials/running-sample-queries.mdx @@ -0,0 +1,9 @@ +import GettingStartedQueries from './getting-started-queries.mdx'; + +## Run sample queries +1. We will now run some queries in the VM. To avoid copy/paste issues between the host and the VM, we will open this quick start in the VM. Go to the virtual desktop, start Firefox and point it to this quick start. +2. Once in Teradata Studio Express, go to `Query Development` perspective (go to the top menu and select `Window` -> `Query Development`). +3. Connect using the previously created connection profile by double-clicking on `Database Connections` -> `New Teradata`. +4. Using `dbc` user, we will create a new database called `HR`. Copy/paste this query and run it by hitting the run query button (image:run.query.button.png[Run Query Button, 24, 24]) or pressing [F5] key: + + \ No newline at end of file diff --git a/quickstarts/_partials/tabsAzure.mdx b/quickstarts/_partials/tabsAzure.mdx new file mode 100644 index 0000000000..3fc4c47214 --- /dev/null +++ b/quickstarts/_partials/tabsAzure.mdx @@ -0,0 +1,50 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + + + + ```bash + az disk create -n teradata-vantage-express --size-gb 60 + az vm create ` + --name teradata-vantage-express ` + --image UbuntuLTS ` + --admin-username azureuser ` + --ssh-key-name vantage-ssh-key ` + --size Standard_F4s_v2 ` + --public-ip-sku Standard + + $diskId = (az disk show -n teradata-vantage-express --query 'id' -o tsv) | Out-String + az vm disk attach --vm-name teradata-vantage-express --name $diskId + ``` + + + ```bash +az disk create -n teradata-vantage-express --size-gb 60 +az vm create \ + --name teradata-vantage-express \ + --image UbuntuLTS \ + --admin-username azureuser \ + --ssh-key-name vantage-ssh-key \ + --size Standard_F4s_v2 \ + --public-ip-sku Standard + +DISK_ID=$(az disk show -n teradata-vantage-express --query 'id' -o tsv) +az vm disk attach --vm-name teradata-vantage-express --name $DISK_ID + ``` + + + ```bash +az disk create -n teradata-vantage-express --size-gb 60 +az vm create \ + --name teradata-vantage-express \ + --image UbuntuLTS \ + --admin-username azureuser \ + --ssh-key-name vantage-ssh-key \ + --size Standard_F4s_v2 \ + --public-ip-sku Standard + +DISK_ID=$(az disk show -n teradata-vantage-express --query 'id' -o tsv) +az vm disk attach --vm-name teradata-vantage-express --name $DISK_ID + ``` + + \ No newline at end of file diff --git a/quickstarts/_partials/tabsDBT.mdx b/quickstarts/_partials/tabsDBT.mdx new file mode 100644 index 0000000000..5c432b250e --- /dev/null +++ b/quickstarts/_partials/tabsDBT.mdx @@ -0,0 +1,36 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + + + + Run in Powershell: + 
```bash + gcloud compute instances create teradata-vantage-express ` + --zone=us-central1-a ` + --machine-type=n2-custom-4-8192 ` + --create-disk=boot=yes,device-name=ve-disk,image-project=ubuntu-os-cloud,image-family=ubuntu-2004-lts,size=70,type=pd-balanced ` + --enable-nested-virtualization ` + --tags=ve + ``` + + + ```bash + gcloud compute instances create teradata-vantage-express \ + --zone=us-central1-a \ + --machine-type=n2-custom-4-8192 \ + --create-disk=boot=yes,device-name=ve-disk,image-project=ubuntu-os-cloud,image-family=ubuntu-2004-lts,size=70,type=pd-balanced \ + --enable-nested-virtualization \ + --tags=ve + ``` + + + ```bash + gcloud compute instances create teradata-vantage-express \ + --zone=us-central1-a \ + --machine-type=n2-custom-4-8192 \ + --create-disk=boot=yes,device-name=ve-disk,image-project=ubuntu-os-cloud,image-family=ubuntu-2004-lts,size=70,type=pd-balanced \ + --enable-nested-virtualization \ + --tags=ve + ``` + + \ No newline at end of file diff --git a/quickstarts/_partials/tabsGCP.mdx b/quickstarts/_partials/tabsGCP.mdx new file mode 100644 index 0000000000..0b7cb1f05d --- /dev/null +++ b/quickstarts/_partials/tabsGCP.mdx @@ -0,0 +1,24 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + + + + Run in Powershell: + ```bash + python -m venv env + source env/Scripts/activate + ``` + + + ```bash + python3 -m venv env + source env/bin/activate + ``` + + + ```bash + python3 -m venv env + source env/bin/activate + ``` + + \ No newline at end of file diff --git a/quickstarts/_partials/tabsJupyterNotebook.mdx b/quickstarts/_partials/tabsJupyterNotebook.mdx new file mode 100644 index 0000000000..92070b75e1 --- /dev/null +++ b/quickstarts/_partials/tabsJupyterNotebook.mdx @@ -0,0 +1,20 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + + + + ```bash + docker run -e "accept_license=Y" -p 127.0.0.1:8888:8888 -v ${PWD}:/home/jovyan/JupyterLabRoot teradata/jupyterlab-extensions + ``` + + + ```bash + docker run -e "accept_license=Y" -p 127.0.0.1:8888:8888 -v $PWD:/home/jovyan/JupyterLabRoot teradata/jupyterlab-extensions + ``` + + + ```bash + docker run -e "accept_license=Y" -p 127.0.0.1:8888:8888 -v $PWD:/home/jovyan/JupyterLabRoot teradata/jupyterlab-extensions + ``` + + \ No newline at end of file diff --git a/quickstarts/_partials/tabsTPT.mdx b/quickstarts/_partials/tabsTPT.mdx new file mode 100644 index 0000000000..f0636d8639 --- /dev/null +++ b/quickstarts/_partials/tabsTPT.mdx @@ -0,0 +1,22 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + + + + Run in Powershell: + ```bash + Unzip the downloaded file and run `setup.exe`. + ``` + + + ```bash + Unzip the downloaded file and run `TeradataToolsAndUtilitiesXX.XX.XX.pkg`. + ``` + + + Unzip the downloaded file, go to the unzipped directory and run: + ```bash + ./setup.sh a + ``` + + \ No newline at end of file diff --git a/quickstarts/_partials/terraform-video.mdx b/quickstarts/_partials/terraform-video.mdx new file mode 100644 index 0000000000..85a66ec758 --- /dev/null +++ b/quickstarts/_partials/terraform-video.mdx @@ -0,0 +1,10 @@ + \ No newline at end of file diff --git a/quickstarts/_partials/use-csae.mdx b/quickstarts/_partials/use-csae.mdx new file mode 100644 index 0000000000..dfbaaeb894 --- /dev/null +++ b/quickstarts/_partials/use-csae.mdx @@ -0,0 +1,3 @@ +:::note +You can now get a hosted instance of Vantage for free at [https://clearscape.teradata.com](https://clearscape.teradata.com/). 
+::: \ No newline at end of file diff --git a/quickstarts/_partials/vantage_clearscape_analytics.mdx b/quickstarts/_partials/vantage_clearscape_analytics.mdx new file mode 100644 index 0000000000..389876ac42 --- /dev/null +++ b/quickstarts/_partials/vantage_clearscape_analytics.mdx @@ -0,0 +1,5 @@ +:::note +If you need a test instance of Vantage, you can provision one for free at [https://clearscape.teradata.com](https://clearscape.teradata.com/sign-in?utm_source=dev_portal&utm_medium=quickstart_tutorial&utm_campaign=quickstarts) +::: + +![test](../images/run-vantage/boot-manager-menu.png) \ No newline at end of file diff --git a/quickstarts/analyze-data/_category_.json b/quickstarts/analyze-data/_category_.json new file mode 100644 index 0000000000..2c6d475388 --- /dev/null +++ b/quickstarts/analyze-data/_category_.json @@ -0,0 +1,4 @@ +{ + "label": "Analyze data", + "position": 6 + } \ No newline at end of file diff --git a/quickstarts/analyze-data/create-stunning-visualizations-in-power-bi-using-data-from-teradata-vantage.md b/quickstarts/analyze-data/create-stunning-visualizations-in-power-bi-using-data-from-teradata-vantage.md new file mode 100644 index 0000000000..839dbc751b --- /dev/null +++ b/quickstarts/analyze-data/create-stunning-visualizations-in-power-bi-using-data-from-teradata-vantage.md @@ -0,0 +1,144 @@ +--- +sidebar_position: 12 +author: Kevin Bogusch, Paul Ibberson +email: paul.ibberson2@teradata.com +page_last_update: January 14th, 2022 +description: Connect Teradata Vantage to Power BI Desktop. +keywords: [data warehouses, compute storage separation, teradata, vantage, cloud data platform, object storage, business intelligence, enterprise analytics, microsoft power bi, power bi] +--- +import ClearscapeDocsNote from '../_partials/vantage_clearscape_analytics.mdx' + +# Create Vizualizations in Power BI using Vantage + +### Overview + +:::note +This guide includes content from both Microsoft and Teradata product documentation. +::: + +This article describes the process to connect your Power BI Desktop to Teradata Vantage for creating reports and dramatic visualizations of your data. Power BI supports Teradata Vantage as a data source and can use the underlying data just like any other data source in Power BI Desktop. + +[Power BI](https://docs.microsoft.com/en-us/power-bi/power-bi-overview) is a collection of software services, applications, and connectors that work together to turn your unrelated sources of data into coherent, visually immersive, and interactive insights. + +.Power BI consists of: +* A Windows desktop application, called [Power BI Desktop](https://docs.microsoft.com/en-us/power-bi/fundamentals/desktop-what-is-desktop) +* An online SaaS (Software as a Service) service, called the [Power BI service](https://docs.microsoft.com/en-us/power-bi/fundamentals/power-bi-service-overview) +* [Power BI mobile](https://docs.microsoft.com/en-us/power-bi/consumer/mobile/mobile-apps-for-mobile-devices) apps for Windows, iOS, and Android devices + +![Power BI elements](../business-intelligence/images/connect-power-bi/power.bi.elements.png) + +These three elements—Power BI Desktop, the Power BI service, and the mobile apps—are designed to let people create, share, and consume business insights in the way that serves them, or their role, most effectively. 
+ +![Power BI overview blocks](../business-intelligence/images/connect-power-bi/power.bi.overview.blocks.png) + +A fourth element, [Power BI Report Server](https://docs.microsoft.com/en-us/power-bi/report-server/get-started), allows you to publish Power BI reports to an on-premises report server, after creating them in Power BI Desktop. + +Power BI Desktop supports Vantage as a 3rd party data source not as a ‘native’ data source. Instead, published reports on Power BI service will need to use the [on-premises data gateway](https://docs.microsoft.com/en-us/power-bi/connect-data/service-gateway-onprem) component to access Vantage. + +This getting started guide will show you how to connect to a Teradata Vantage. Power BI Desktop Teradata connector uses the [.NET Data Provider for Teradata](https://downloads.teradata.com/download/connectivity/net-data-provider-for-teradata). You need to install the driver on computers that use the Power BI Desktop. The .NET Data Provider for Teradata single installation supports both 32-bit or 64-bit Power BI Desktop application. + +### Prerequisites +You are expected to be familiar with Azure services, Teradata Vantage, and Power BI Desktop. + +You will need the following accounts and system. + +* The Power BI Desktop is a free application for Windows. (Power BI Desktop is not available for Macs. You could run it in a virtual machine, such as [Parallels](https://www.parallels.com) or [VMware Fusion](https://www.vmware.com/products/fusion.html), or in Apple’s [Boot Camp](https://support.apple.com/en-vn/boot-camp), but that is beyond the scope of this article.) + +* A Teradata Vantage instance with a user and password. The user must have permission to data that can be used by Power BI Desktop. Vantage must be accessible from Power BI Desktop. ++ + + + + + +* The [.NET Data Provider for Teradata](https://downloads.teradata.com/download/connectivity/net-data-provider-for-teradata). + +### Getting Started +### Install Power BI Desktop +You can install Power BI Desktop from the [Microsoft Store](https://aka.ms/pbidesktopstore) or [download the installer](https://aka.ms/pbiSingleInstaller) and run it directly. + +### Install the .NET Data Provider for Teradata +Download and install the latest version of the [.NET Data Provider for Teradata](https://downloads.teradata.com/download/connectivity/net-data-provider-for-teradata). + +Note that there are multiple files available for download. You want the file that starts with “tdnetdp”. + +### Connect to Teradata Vantage +* Run Power BI Desktop, which has a yellow icon. + +![Power BI icon](../business-intelligence/images/connect-power-bi/power.bi.icon.png) + +* If the opening (splash) screen is showing, click on Get data. + +![Power BI splash screen](../business-intelligence/images/connect-power-bi/power.bi.splash.screen.png) + +Otherwise, if you are in the main form of Power BI, ensure that you are on the _Home_ ribbon and click on _Get data_. Click on _More…_. + +![Power BI Get Data menu](../business-intelligence/images/connect-power-bi/power.bi.get.data.menu.png) + +* Click on _Database_ on the left. +* Scroll the list on the right until you see _Teradata database_. Click on _Teradata database_, and then click on the _Connect_ button. + +(“Teradata database” and “Teradata Vantage” are synonymous in this article.) 
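Before stepping through the connection dialogs below, it can save time to confirm that the Vantage hostname and credentials work outside Power BI. One minimal, optional check uses the Python `teradatasql` driver; the host, username, and password shown are placeholders for your own values:

```python
import teradatasql

# Placeholder connection details - substitute your Vantage host and credentials.
with teradatasql.connect(host="your-vantage-host", user="your-username", password="your-password") as con:
    with con.cursor() as cur:
        cur.execute("SELECT SESSION")  # trivial query that proves login and connectivity
        print("Connected, session number:", cur.fetchone()[0])
```

If this check fails, resolve network access or credentials first; Power BI Desktop will use the same details.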
+ +![Power BI Database picker](../business-intelligence/images/connect-power-bi/power.bi.database.picker.png) + +* In the window that appears, enter the name or IP address of your Vantage system into the text box. You can choose to _Import_ data directly into Power BI data model, or connect directly to the data source using [DirectQuery](https://docs.microsoft.com/en-us/power-bi/desktop-use-directquery) and click _OK_. + +![Power BI server connection](../business-intelligence/images/connect-power-bi/power.bi.server.connect.png) + +(Click _Advanced_ options to submit hand-crafted SQL statement.) + +For credentials, you have the option of connecting with your _Windows_ login or _Database_ username defined in Vantage, which is more common. Select the appropriate authentication method and enter in your username and password. Click _Connect_. + +You also have the option of authenticating with an LDAP server. This option is hidden by default. + +If you set the environment variable, _PBI_EnableTeradataLdap_, to _true_, then the LDAP authentication method will become available. + +![Power BI LDAP connection](../business-intelligence/images/connect-power-bi/power.bi.ldap.png) + +Do note that LDAP is not supported with the on-premises data gateway, which is used for reports that are published to the Power BI service. If you need LDAP authentication and are using the on-premises data gateway, you will need to submit an incident to Microsoft and request support. + +Alternatively, you can [configure Kerberos-based SSO from Power BI service to on-premise data sources](https://docs.microsoft.com/en-us/power-bi/connect-data/service-gateway-sso-kerberos) like Teradata. + +Once you have connected to the Vantage system, Power BI Desktop remembers the credentials for future connections to the system. You can modify these credentials by going to _File > Options and settings > Data source settings_. + +The Navigator window appears after a successful connection. It displays the data available on the Vantage system. You can select one or more elements to use in Power BI Desktop. + +![Power BI Navigator](../business-intelligence/images/connect-power-bi/power.bi.navigator.png) + +You preview a table by clicking on its name. If you want to load it into Power BI Desktop, ensure that you click the checkbox next to the table name. + +You can _Load_ the selected table, which brings it into Power BI Desktop. You can also _Edit_ the query, which opens a query editor so you can filter and refine the set of data you want to load. + +_Edit_ may be called _Transform data_, depending upon the version of Power BI Desktop that you have. + +For information on joining tables, see [Create and Manage Relationships in Power BI Desktop](https://docs.microsoft.com/en-us/power-bi/desktop-create-and-manage-relationships) feature. + +To publish your report, click _Publish_ on _Home_ ribbon in Power BI Desktop. + +![Power BI Publish](../business-intelligence/images/connect-power-bi/power.bi.publish.png) + +Power BI Desktop will prompt you to save your report. Choose _My workspace_ and click _Select_. + +![Power BI publish to my workspace](../business-intelligence/images/connect-power-bi/power.bi.workspace.png) + +Once report has been published, click _Got it_ to close. You may also click the link, which has the report name in the link. + +![Power BI successfully published](../business-intelligence/images/connect-power-bi/power.bi.success.png) + +This is an example of a report created in Power BI Desktop. 
+ +![Power BI Report](../business-intelligence/images/connect-power-bi/power.bi.report.png) + +### Next steps +You can combine data from many sources with Power BI Desktop. Look at the following links for more information. + +* [What is Power BI Desktop?](https://docs.microsoft.com/en-us/power-bi/desktop-what-is-desktop) +* [Data Sources in Power BI Desktop](https://docs.microsoft.com/en-us/power-bi/desktop-data-sources) +* [Shape and Combine Data with Power BI Desktop](https://docs.microsoft.com/en-us/power-bi/desktop-shape-and-combine-data) +* [Connect to Excel workbooks in Power BI Desktop](https://docs.microsoft.com/en-us/power-bi/desktop-connect-excel) +* [Enter data directly into Power BI Desktop](https://docs.microsoft.com/en-us/power-bi/desktop-enter-data-directly-into-desktop) + +import CommunityLinkPartial from '../_partials/community_link.mdx'; + + diff --git a/quickstarts/analyze-data/deploy-and-monitor-machine-learning-models-with-teradata-modelops-and-byom.md b/quickstarts/analyze-data/deploy-and-monitor-machine-learning-models-with-teradata-modelops-and-byom.md new file mode 100644 index 0000000000..c43d0433b7 --- /dev/null +++ b/quickstarts/analyze-data/deploy-and-monitor-machine-learning-models-with-teradata-modelops-and-byom.md @@ -0,0 +1,506 @@ +--- +sidebar_position: 6 +author: Pablo Escobar de la Oliva +email: pablo.escobardelaoliva@teradata.com +page_last_update: May 1st, 2024 +description: Tutorial for deploying and monitoring a PMML model into Vantage using ClearScape Analytics ModelOps +keywords: [modelops, byom, python, clearscape analytics, teradata, data warehouses, teradata, vantage, cloud data platform, machine learning, artificial intelligence, business intelligence, enterprise analytics] +--- + +import ClearscapeDocsNote from '../_partials/vantage_clearscape_analytics.mdx' + +# ModelOps - Import and Deploy your first BYOM Model + +## Overview + +This tutorial helps you to get started quickly using ClearScape Analytics ModelOps. We discuss key concepts briefly, so you can get right down to importing your first Bring-your-own-model (BYOM) models into ModelOps. In other tutorials in this quickstart site, you will have the opportunity to go deeper into other deployment and automation patterns with ClearSCape Analytics ModelOps. + +In this tutorial, you will learn: + +* What’s the difference between BYOM functions and ModelOps BYOM + +* Importing your first BYOM model in the Model Registry through the graphical user interface + +* Deploying the model in Vantage with automated scheduling and monitoring capabilities + +## Prerequisites + +We provide an associated notebook and sample data that you can import into your clearscape environment to access and run all of the code examples included in the quickstart. [Download the ModelOps sample notebooks and data](attachments/ModelOps_Quickstart_BYOM.zip) + +* Access to a Teradata Vantage instance with ClearScape Analytics (includes ModelOps) + +* Access to a Jupyter notebook environment or use the one available in ClearScape Analytics Experience: + + + + + + +## Key concepts you should know about first + +### Bring your own model (BYOM) in Teradata Vantage + +The Vantage Bring Your Own Model (BYOM) package gives data scientists and analysts the ability to operationalize predictive models in Vantage. Predictive models trained in external tools can be used to score data stored in Vantage using the BYOM Predict functions. 
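For orientation, scoring with the BYOM package ultimately comes down to a single SQL call. The sketch below assumes the demo objects used later in this quickstart (the `MLDB` database where the BYOM functions are installed, the `pima_patient_features` table, and a model previously saved into `aoa_byom_models` under an illustrative `model_id`):

```sql
-- Illustrative only: score a feature table with a PMML model stored in a model table.
SELECT *
FROM mldb.PMMLPredict (
    ON demo_user.pima_patient_features
    ON (SELECT * FROM demo_user.aoa_byom_models WHERE model_id = 'byom_quickstart_v1') DIMENSION
    USING
        Accumulate ('PatientId')
) AS scored;
```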
+ +Create or convert your predictive model using a supported model interchange format (PMML, MOJO, ONNX, Dataiku, and DataRobot are currently available), import it in a Vantage table, and use the BYOM PMMLPredict, H2OPredict, ONNXPredict, DataikuPredict, or DataRobotPredict to score your data with the model. + +### Bring your own model (BYOM) in Teradata Vantage with ModelOps + +In ModelOps the BYOM package is enriched with additional governance, automation, and monitoring capabilities for data scientists and machine learning engineers with the possibility of applying all of this without coding. In addition to the compatible formats of BYOM package, ModelOps extends the possibility to import and score models inside Vantage to Python scripts, R scripts and SAS scoring accelerator models. +Once you have your compatible model created or converted using a supported format (PMML, MOJO, ONNX, Dataiku, DataRobot, Python script, R script and SAS scoring accelerator model) then you can either use the ModelOps graphical user interface or the ModelOps code SDK to import into the model registry. + +### Understand where we will focus at the ModelOps methodology +In this tutorial, we will show you the end-to-end of this process using the associated Notebook and the ModelOps graphical user interface. + +![ModelOps Methodology BYOM screenshot](../modelops/images/byom_meth.png) + +## Steps in this Guide + +1. Create a project and connection (ModelOps) +2. Environment Setup (Notebook) +3. Creating datasets (ModelOps) +4. Train a model and export to PMML (Notebook) +5. Import the PMML into Vantage using BYOM functions (Notebook) +6. Import the PMML into Vantage using ModelOps Graphical user interface (ModelOps) +7. Go through Automated Lifecycle - Evaluation, Approve, Deploy (ModelOps) +8. Default and Custom alerting rules for Monitoring (ModelOps) +9. Custom Evaluation metrics and charts (Notebook) + +## 1. Create a project + +Login into ModelOps and navigate to the Projects screen. + +Click on the CREATE PROJECT button located on the top-right of the screen. We're using an cloned demo code in ModelOps with this path: /app/built-in/demo-models as git repository. Here we recommend you clone into your git repository instance the demo models public git: https://github.com/Teradata/modelops-demo-models.git in the branch "tmo" + +![ModelOps projects screenshot](../modelops/images/projects.png) + +Inside the Project creation sheet panel, include the following values: + +* Name: "BYOM Quickstart" + +* Description: "BYOM Quickstart" + +* Group: DEMO + +* Path: /app/built-in/demo-models + +* Credentials: No Credentials + +* Branch: tmo + +Click the TEST GIT CONNECTION button. If the test is succesful then click on save and continue. + +![ModelOps projects creating](../modelops/images/Project_Creating.png) + +## Create a Personal Connection + +In this guide we will skip creating a service connection, so click SAVE & CONTINUE and then NEXT to create a personal connection. 
+ +![ModelOps projects save](../modelops/images/save_continue.png) +![ModelOps projects personal](../modelops/images/personal1.png) + +Inside the Personal Connection of the Projects creation sheet panel, include the following values: + +* Name: Quickstart Personal + +* Description: Quickstart Personal Connection + +* Host: ClearScape-url + +* Database: "demo_user" + +* VAL Database Name: "VAL" + +* BYOM Database Name: "MLDB" + +* Login Mechanism: "TDNEGO" + +* Username: demo_user + +* Pasword: your-password + +Test the Vantage connection by clicking on the TEST CONNECTION button. + +Click save. + +![ModelOps connection](../modelops/images/Personal_Connection.png) + +This is how the Projects panel will show with the new project created: + +![ModelOps projects with quickstart screenshot](../modelops/images/projects_quickstart.png) + +## Connection Healthcheck panel + +Enter into the project by clicking on it, and get inside Settings on the Left-hand menu. Use View details from your connection + +![ModelOps view](../modelops/images/view_details.png) + +Then you should get the healthcheck panel, where it will show if SQLE, BYOM and VAL associated rights are enabled for this connection user. If there is any error here, contact your dba to apply the specific rights. Review the onboarding bteq script that comes in the attached files of the quickstart for the specific GRANT commands that are required. + +![ModelOps healthcheck](../modelops/images/healthcheck.png) + +## 2. Environment Setup (Notebook) + +Follow the Notebook attached in this quickstart to perform the envrionnment setup and checks at the database level. + +## 3. Creating datasets (ModelOps) + +Click on your newly created project and then click on the Datasets button located on the left-hand menu. Click on CREATE DATASET TEMPLATE. + +![ModelOps dataset](../modelops/images/dataset_template.png) + + +Enter the following values: + +* Name: dataset + +* Description: dataset + +* Feature Catalog: Vantage + +* Database: your-db + +* Table: aoa_statistics_metadata + +![ModelOps dataset edit](../modelops/images/dataset_template2.png) + +Click next and enter the Features Query: This query will be used to identify the features table, you can also Validate statistics and preview Data: + +``` sql +SELECT * FROM pima_patient_features +``` + +![ModelOps dataset features](../modelops/images/dataset_template_features.png) + + + +Continue to Entity & Target and include the query: This query will be used to join with the features based on the same entity and to filter the rows of the Training, Evaluation and Scoring Datasets. + +You need to select HasDiabetes as the target variable from this query, then Validate Statistics + +``` sql +SELECT * FROM pima_patient_diagnoses +``` + +![ModelOps dataset features](../modelops/images/dataset_template_target.png) + + +Continue to Predictions and include the details of the database, table, and the query: This query will be used as the Input of the execution of your model in Production when this model will be deployed as BATCH (Note: BYOM models can only be deployed as batch in ModelOps version 7) + +* Database: your-db + +* Table: pima_patient_predictions + +* Query: + +``` sql +SELECT * FROM pima_patient_features WHERE patientid MOD 5 = 0 +``` + +![ModelOps dataset features](../modelops/images/dataset_template_prediction.png) + +### Create Training dataset + +Click on create dataset, Enter the name and description and Select training and click next. 
+ +This query we want to filter and get 80% of rows of the dataset, we use `MOD 5 <> 0` to get this: + +``` sql +SELECT * FROM pima_patient_diagnoses WHERE patientid MOD 5 <> 0 +``` + +![ModelOps dataset basic](../modelops/images/training_dataset_basic.png) +![ModelOps dataset training](../modelops/images/training_dataset.png) + +Confirm the query and click on create. + +### Create Evaluation dataset + +Click on create dataset, Enter the name and description and Select evaluation and click next. + +This query we want to filter and get 20% of rows of the dataset, we use MOD 5 = 0 to get this: + +``` sql +SELECT * FROM pima_patient_diagnoses WHERE patientid MOD 5 = 0 +``` + +![ModelOps eval dataset](../modelops/images/evaluation_dataset_basic.png) +![ModelOps eval dataset details](../modelops/images/evaluation_dataset.png) + + +Confirm the query and click on create. + +This is how it should show both datasets for Training and Evaluation + +![datasets_created](../modelops/images/datasets_created.png) + +## 4. Train a model and export to PMML (Notebook) + +Follow the Notebook attached in this quickstart to perform the model training, conversion and download the model pmml file for following steps. + +## 5. Import the PMML into Vantage using BYOM functions (Notebook) + +Follow the Notebook attached in this quickstart to use and understand the BYOM package functions, this way will publish the models in Vantage, but not in the ModelOps registry and we will not have governance, automation or monitoring capabilities. + +## 6. Import the PMML into Vantage using ModelOps Graphical user interface (ModelOps) + +### Import into ModelOps + +Go to Models at the left-hand menu and click on DEFINE BYOM MODEL + +![ModelOps define new model](../modelops/images/define_new.png) + +Fill the fields with this values as example: + +* Name: byom + +* Description: byom + +* Format: PMML + +Click on Save Model & Import versions + +![ModelOps define new byom model](../modelops/images/byom_basic.png) + +Fill the field for external id to track it from the training tool, and upload the model.pmml file - NOTE It has to be this exact name: model.pmml + +* External id: 001 + +* model file: model.pmml + +![ModelOps define new byom model](../modelops/images/byom_model.png) + +### Enable default automated Evaluation and Monitoring + +In this screen we are going to keep marked the Enable Monitoring capabily. + +We need to select the training dataset that was used for this model pmml when training. We have already created this dataset before, so we select + +Then we press on VALIDATE. + +BYOM predict functions generate an output based on a JSON, and this is different for every BYOM model. We need to know the specific field that is the target/output of our prediction. In order to use it in our evaluation logic and generate model metrics (accuracy, precision, etc.). For this we require a CAST expression on the JSON output file. + +We have included a Generate Link to help us on validating and implementing this CAST expression. So click on the Generate button to move into the helper screen and get the expression + +![ModelOps monitoring1](../modelops/images/byom_monitoring1.png) + +Now select the target/output variable of our prediction. In this demo case is: predicted_HasDiabetes. + +Click on Save and let the helper copy the expression for you. 
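If you are curious what the helper produces, the generated expression simply pulls the prediction field out of the JSON report column that the BYOM predict function writes. You can try the same extraction by hand once predictions exist; the sketch below assumes the predictions table and columns defined in the dataset template earlier in this guide:

```sql
-- Extract the integer prediction from the JSON report produced by PMMLPredict.
SELECT
    PatientId,
    CAST(CAST(json_report AS JSON).JSONExtractValue('$.predicted_HasDiabetes') AS INT) AS predicted_HasDiabetes
FROM demo_user.pima_patient_predictions;
```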
+ +![[ModelOps monitoring2](../modelops/images/byom_monitoring2.png), width=50%] + +This is the CAST expression, Click on Save on the dialog: +CAST(CAST(json_report AS JSON).JSONExtractValue('$.predicted_HasDiabetes') AS INT) + +![ModelOps monitoring save](../modelops/images/byom_monitoring_save.png) + +Now you can validate the Cast Expression and click on Save: + +![ModelOps monitoring save](../modelops/images/byom_monitoring_3.png) + +A new job for MODEL IMPORT and another job for COMPUTE STATISTICS will run for few minutes. + +![ModelOps monitoring save](../modelops/images/statistics_job.png) + +## 7. Go through Automated Lifecycle - Evaluation, Approve, Deploy (ModelOps) + +### Evaluate the model version in ModelOps + +After finishing the jobs a new model version will be available in the Model version catalog of this byom model like the following image. Click on the model version to get inside Lifecycle: + +![ModelOps lifecycle](../modelops/images/model_version.png) + +The model is in IMPORT stage. we can now evaluate the model, click EVALUATE to run the automated default evaluation job + +![ModelOps evaluate](../modelops/images/model_evaluate.png) + +Select the evaluation dataset and click on EVALUATE MODEL. + +![ModelOps evaluate dataset](../modelops/images/model_evaluate2.png) + +This will create a new Job for the Evaluation and will show the log. These screen can be closed at the X button at the top-right. + +![ModelOps evaluation job](../modelops/images/evaluation_job.png) + +You can access at any time at the left-hand menu JOBS screen. to go again into the log you just need to click on the 3 dots of the job and VIEW DETAILS. This is how it should look: + +![ModelOps evaluation job](../modelops/images/jobs.png) + +Once the job is finished, model will be in the EVALUATE stage in the lifecycle screen. Go to your model version to see it. + +You can check all the details of the evaluation step, including an evaluation REPORT, where you will see metrics and Charts that the default Evaluation logic has generated. NOTE: These metrics are default for Classification and Regression models and can be customized with a coded template that will share later in the quickstart. + +![ModelOps evaluation lifecycle](../modelops/images/evaluation_report.png) +![ModelOps evaluation lifecycle](../modelops/images/evaluation_report2.png) + +### Approve the model version + +Once the model version is evaluated, it is ready to be approved or rejected. This approval can be done through model lifecycle screen, in the model report screen and it can also be done through REST API integrating an external tool like Jira/BPM case management systems. + +Let's get into the Approval dialog and include the following description, as an example: + +* Approval comment: Go for Production + +![ModelOps approval](../modelops/images/go.png) + +### Deploy the model version and schedule scoring + +to deploy the model you need to use the DEPLOY button in the model lifecycle screen. + +![ModelOps deploy](../modelops/images/deploy.png) + +For BYOM models the deployment target available is In-Vantage, as we want to leverage the BYOM predict functions in Vantage: + +![ModelOps deploy](../modelops/images/deploy_details1.png) + +Publish the model: Select the connection to Vantage that will be used to publish the model, the database and the table. Here we will use our created connection and the table we created for storing BYOM models: aoa_byom_models. 
Click Next after including these details + +* Connection: personal + +* Database: demo_user + +* Table: aoa_byom_models + +![ModelOps deploy2](../modelops/images/deploy_details2.png) + +Now in the Scheduling step, you are able to enable scheduling and select what is the frequency/cadence of this scoring. Keep marked the Enable Scheduling checkbox and select "Manual" in this demo, inside clearscape.teradata.com in order to save resources the scheduling options are disabled. Any scheduling option is available since we can include a CRON expression. + +In this screen we will also select the dataset template to be used when scoring the model in production. The Prediction details of the dataset will be used such as the Input query, and output prediction table that we defined in the Datasets step. + +Click on Deploy to finalize this step + +![ModelOps deploy3](../modelops/images/deploy_details3.png) + +A new Deployment job will be running by the ModelOps Agent. once this is finished a new deployment will be available in the Deployments section of the left-hand menu. + +![ModelOps deploy job](../modelops/images/deploy_job.png) + +### Deployment details including history of jobs, feature/prediction drift and performance monitoring + +Go to the left-hand menu Deployments, and see the new deployment from the BYOM model is available, click on it to see the details and go to the Jobs tab + +![[ModelOps deployments](../modelops/images/deployments.png), width=50%] + +In the Jobs tab you will see the history of executions of this model deployed. Let's run now a new scoring using the Run now button. This button can be also scheduled externally through REST APIs + +![ModelOps deployments](../modelops/images/deployment_jobs.png) + +After executing the scoring job, it should look like this: + +![ModelOps deployments](../modelops/images/deployment_jobs2.png) + +And we can get into the output details of this job, by clicking on the three dots at the right, and view predictions + +![ModelOps deployments](../modelops/images/deployment_predictions.png) + +Now that we have run a job in production, the default Monitoring capabilities are enabled, you can check both feature and prediction drift to see individually per feature the histogram calculation and the Population Stability Index (PSI) KPI for drift monitoring + +![ModelOps feature drift deployments](../modelops/images/feature_drift.png) +![ModelOps prediction drift deployments](../modelops/images/prediction_drift.png) + +In the Performance metrics tab, we see that there is only a single metric data point, this is because performance monitoring relies on Evaluation jobs. So let's create a new dataset and run a new evaluation at this deployment to simulate we have new fresh data and want to check on the performance of my model by comparing the metrics with the previous evaluation. + +### Performance monitoring with new dataset + +Let's create a new evaluation dataset in Datasets left-hand menu. 
+ +We will use the same dataset template that we created and will create a new dataset with the following details + +* Name: evaluation2 + +* Description: evaluation2 + +* Scope: evaluation + +![ModelOps evaluation2](../modelops/images/evaluation2.png) + +And let's simulate the new evaluation with a new dataset query + +``` sql +SELECT * FROM pima_patient_features WHERE patientid MOD 10 = 0 +``` + +And click on create to generate new dataset for evaluation + +![ModelOps evaluation detail](../modelops/images/evaluation2_detail.png) + +Now you can go back to your deployment to evaluate the model version deployed: + +![ModelOps evaluation detail](../modelops/images/deployment_evaluate.png) + +Use the new dataset created in the Evaluation job panel: + +* Dataset template: dataset + +* Dataset: evaluate2 + +and click on EVALUATE model + +![ModelOps evaluation detail](../modelops/images/deployment_evaluate2.png) + +Once the Evaluation job is finished, then the performance metrics will show a new set of metrics with the new dataset used: + +![ModelOps performance monitoring](../modelops/images/performance.png) + +## 8. Default and Custom alerting rules for Monitoring (ModelOps) + +### Enabling alerting + +Default Alerts in ModelOps are activated at the models screen, There is a Enable Alerts column in this table, activate it to start with default alerting + +![ModelOps enabling alerts](../modelops/images/enable_alerts.png) + +Once this alerts are enabled you can check on the definition of the default alert, by getting inside the model and getting into the ALERT tab: + +![ModelOps configuring alert](../modelops/images/alert_configuration.png) + +### Updating alerting rules + +We can create new alerts, like new rules for performance monitoring or update default alerting rules. + +Let's do an alert edit, on the feature drift monitoring. click on the alert edit + +![ModelOps configuring alert2](../modelops/images/alert_configuration2.png) + +Here you can update the fields. Let's update the value treshold from 0.2 to 0.18 and click on UPDATE + +![ModelOps configuring alert3](../modelops/images/alert_configuration3.png) + +After editing the rule, your alerts screen should look like this: + +![ModelOps configuring alert4](../modelops/images/alert_configuration4.png) + +### Reviewing alerts + +Now that we have alert edited, we should wait 1 minute till we get a new alert into the ModelOps tool. This alert can be configured to send an email to a set of email addresses as well. + +Now we have received the alert, we can see a red circle in the alerts at the left-hand menu + +We can directly access to the model version from this screen by clicking on the modelid + +![ModelOps new alert1](../modelops/images/alert_new1.png) + +Once we are in the model lifecycle screen, we see a direct access to Model Drift, let's get inside + +![ModelOps new alert2](../modelops/images/alert_new2.png) + +Then we can see the individual features in red in the feature drift tab of my deployed model. This alert is indicating that the latest scoring data is drifted from the training data with that value of population stability index(PSI). And teams can then make proactive actions to evaluate the drift of the model and replace the model in production if is needed + +![ModelOps new alert3](../modelops/images/alert_new3.png) + +## 9. 
Custom Evaluation metrics and charts (Notebook) + +Follow the Notebook attached in this quickstart to understand the methodology for creating custom Evaluation logic, metrics and charts + +## Summary + +In this quick start we have learned what is the difference between BYOM functions and ModelOps BYOM pattern, How to import models with ModelOps graphical user interface, and how to automate the scoring and monitoring of the model getting Data Drift and Model QUality metrics alerts + +## Further reading + +[ClearScape Analytics ModelOps User Guide](https://docs.teradata.com/search/documents?query=ModelOps&sort=last_update&virtual-field=title_only&content-lang=) + +import CommunityLinkPartial from '../_partials/community_link.mdx'; + + diff --git a/quickstarts/analyze-data/deploy-and-monitor-machine-learning-models-with-teradata-modelops-and-git.md b/quickstarts/analyze-data/deploy-and-monitor-machine-learning-models-with-teradata-modelops-and-git.md new file mode 100644 index 0000000000..9a77369c36 --- /dev/null +++ b/quickstarts/analyze-data/deploy-and-monitor-machine-learning-models-with-teradata-modelops-and-git.md @@ -0,0 +1,203 @@ +--- +sidebar_position: 7 +author: Pablo Escobar de la Oliva +email: pablo.escobardelaoliva@teradata.com +page_last_update: May 29th, 2022 +description: Tutorial for deploying and monitoring a Python model into Vantage using ModelOps and Git repository +keywords: [modelops, python, git, clearscape analytics, teradata, data warehouses, teradata, vantage, cloud data platform, machine learning, artificial intelligence, business intelligence, enterprise analytics] +--- +import ClearscapeDocsNote from '../_partials/vantage_clearscape_analytics.mdx' +import ModelOpsBasic from '../_partials/modelops-basic.mdx' + +# ModelOps - Import and Deploy your first GIT Model + +## Overview + +This is a how-to for people who are new to ClearScape Analytics ModelOps. In the tutorial, you will be able to create a new project in ModelOps, upload the required data to Vantage, and track the full lifecycle of a demo model using code templates and following the methodology for GIT models in ModelOps. + +## Prerequisites + +* Access to a Teradata Vantage instance with ClearScape Analytics (includes ModelOps) + +* Ability to run Jupyter notebooks + + + +Files needed + +Let's start by downloading the needed files for this tutorial. Download these 4 attachments and upload them in your Notebook filesystem. 
Select the files depending on your version of ModelOps: + +ModelOps version 6 (October 2022): + +[Download the ModelOps training Notebook](../modelops/attachments/ModelOps_Training_v6.ipynb) + +[Download BYOM Notebook file for demo use case](../modelops/attachments/BYOM_v6.ipynb) + +[Download data files for demo use case](../modelops/attachments/ModelOps_Data_files_v6.zip) + +[Download BYOM code files for demo use case](../modelops/attachments/ModelOps_BYOM_files_v6.zip) + +Alternatively you can git clone following repos +``` bash +git clone https://github.com/willfleury/modelops-getting-started +git clone https://github.com/Teradata/modelops-demo-models/ +``` + +ModelOps version 7 (April 2023): + +[Download the ModelOps training Notebook](../modelops/attachments/ModelOps_Training_v7.ipynb) + +[Download BYOM Notebook file for demo use case](../modelops/attachments/BYOM_v7.ipynb) + +[Download data files for demo use case](../modelops/attachments/ModelOps_Data_files_v7.zip) + +[Download BYOM code files for demo use case](../modelops/attachments/ModelOps_BYOM_files_v7.zip) + +``` bash +git clone -b v7 https://github.com/willfleury/modelops-getting-started.git +git clone https://github.com/Teradata/modelops-demo-models/ +``` + +Setting up the database and Jupyter environment + +Follow the ModelOps_Training Jupyter Notebook to setup the database, tables and libraries needed for the demo. + +## Understand where we are in the Methodology + +![ModelOps Methodology GIT screenshot](../modelops/images/modelops-git.png) + + + + +## Prepare code templates + +For Git Models we need to fill the code templates available when adding a new model. + +These code scripts will be stored in the git repository under: model_definitions/your-model/model_modules/ + +* __init__.py : this an empty file required for python modules + +* training.py: this script contains train function + +``` python +def train(context: ModelContext, **kwargs): + aoa_create_context() + + # your training code + + # save your model + joblib.dump(model, f"{context.artifact_output_path}/model.joblib") + + record_training_stats(...) +``` + +Review the Operationalize notebook to see how you can execute this from CLI or from notebook as an alternative to ModelOps UI. + +* evaluation.py: this script contains evaluate function + +``` python +def evaluate(context: ModelContext, **kwargs): + aoa_create_context() + + # read your model + model = joblib.load(f"{context.artifact_input_path}/model.joblib") + + # your evaluation logic + + record_evaluation_stats(...) +``` + +Review the Operationalize notebook to see how you can execute this from CLI or from notebook as an alternative to ModelOps UI. + +* scoring.py: this script contains score function + +``` python +def score(context: ModelContext, **kwargs): + aoa_create_context() + + # read your model + model = joblib.load(f"{context.artifact_input_path}/model.joblib") + + # your evaluation logic + + record_scoring_stats(...) +``` + +Review the Operationalize notebook to see how you can execute this from CLI or from notebook as an alternative to ModelOps UI. + +* requirements.txt: this file contains the library names and versions required for your code scripts. 
Example:

``` python
%%writefile ../model_modules/requirements.txt
xgboost==0.90
scikit-learn==0.24.2
shap==0.36.0
matplotlib==3.3.1
teradataml==17.0.0.4
nyoka==4.3.0
aoa==6.0.0
```

* config.json: this file, located in the parent folder (your-model folder), contains the default hyperparameters

``` python
%%writefile ../config.json
{
    "hyperParameters": {
        "eta": 0.2,
        "max_depth": 6
    }
}
```

Review the code scripts for the demo model in the repository: https://github.com/Teradata/modelops-demo-models/

Go into model_definitions -> python-diabetes -> model_modules

## Model Lifecycle for a new GIT model

* Open the project to see the models available from GIT

* Train a new model version

* See how the CommitID from the code repository is tracked

* Evaluate

* Review the evaluation report, including dataset statistics and model metrics

* Compare with other model versions

* Approve

* Deploy in Vantage - Engine, Publish, Schedule. A scoring dataset is required.
Use your connection and select a database, e.g. "aoa_byom_models"

* Deploy in Docker Batch - Engine, Publish, Schedule. A scoring dataset is required.
Use your connection and select a database, e.g. "aoa_byom_models"

* Deploy in Restful Batch - Engine, Publish, Schedule. A scoring dataset is required.
Use your connection and select a database, e.g. "aoa_byom_models"

* Deployments/executions

* Evaluate again with dataset2 to monitor the behavior of the model metrics

* Monitor Model Drift - data and metrics

* Open the BYOM notebook to execute the PMML prediction from SQL when the model is deployed in Vantage

* Test the Restful deployment from the ModelOps UI or with a curl command

* Retire deployments

## Summary

In this quick start we have learned how to follow the full lifecycle of GIT models in ModelOps and how to deploy them into Vantage or into Docker containers for edge deployments, how to schedule batch scoring or run RESTful and on-demand scoring tests, and how to start monitoring Data Drift and Model Quality metrics.

## Further reading
* [ModelOps documentation](https://docs.teradata.com/search/documents?query=ModelOps&sort=last_update&virtual-field=title_only&content-lang=). 
+ +import CommunityLinkPartial from '../_partials/community_link.mdx'; + + \ No newline at end of file diff --git a/quickstarts/analyze-data/execute-airflow-workflows-with-clearscape-analytics-modelops-model-factory-solution.md b/quickstarts/analyze-data/execute-airflow-workflows-with-clearscape-analytics-modelops-model-factory-solution.md new file mode 100644 index 0000000000..4c7785c77c --- /dev/null +++ b/quickstarts/analyze-data/execute-airflow-workflows-with-clearscape-analytics-modelops-model-factory-solution.md @@ -0,0 +1,613 @@ +--- +sidebar_position: 8 +author: Tayyaba Batool +email: tayyaba.batool@teradata.com +page_last_update: Mar 19th, 2024 +description: Tutorial for Model Factory Solution - Executing Airflow workflows with ClearScape Analytics ModelOps +keywords: [modelfactory, modelops, byom, python, clearscape analytics, teradata, data warehouses, teradata, vantage, cloud data platform, machine learning, artificial intelligence, business intelligence, enterprise analytics] +--- +import ClearscapeDocsNote from '../_partials/vantage_clearscape_analytics.mdx' + +# Execute Airflow workflows with ModelOps - Model Factory Solution Accelerator + +## Overview + +The purpose of the **Model Factory Solution Accelerator** of **ClearScape Analytics** is to streamline and accelerate the end-to-end process of developing, deploying, and managing machine learning models within an organization at **Horizontal Scale** by operationalizing **hundreds of models for a business domain at one effort**. It leverages the scalability of in-database analytics and the openness of supporting partner model formats such as H2O or Dataiku. This unique combination enhances efficiency, scalability, and consistency across various stages of the machine learning lifecycle in Enterprise environments. + +By incorporating best practices, automation, and standardized workflows, the Model Factory Solution Accelerator enables teams to rapidly select the data to be used, configure the model required, ensure reproducibility, and deploy **unlimited** number of models seamlessly into production. Ultimately, it aims to reduce the time-to-value for machine learning initiatives and promote a more structured and efficient approach to building and deploying models at scale. Here is the diagram of an automated Workflow: + +![Workflow](../modelops/images/execute-airflow-workflows-with-clearscape-analytics-modelops-model-factory-solution/Workflow.png) + +Here are the steps to implement Model Factory Solution Accelerator using Airflow and ClearScape Analytics ModelOps. Apache Airflow is used for the scheduling and orchestration of data pipelines or workflows. So in this tutorial we are creating an Airflow DAG (Directed Acyclic Graph) which will be executed to automate the lifecycle of ModelOps. + +## Prerequisites + +* In this tutorial it is implemented on local machine using **Visual Studio code** IDE. + +In order to execute shell commands, you can install the VS code extension **"Remote Development"** using the followng link. This extension pack includes the WSL extension, in addition to the Remote - SSH, and Dev Containers extensions, enabling you to open any folder in a container, on a remote machine, or in WSL: +[+++VS code marketplace+++](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.vscode-remote-extensionpack). 
+ +* Access to a Teradata Vantage instance with ClearScape Analytics (includes ModelOps) + + + + + + +## Configuring Visual Studio Code and Installing Airflow on docker-compose + +* Open Visual Studio code and select the option of open a remote window. Then select Connect to WSL-Ubuntu + +* Select File > Open Folder. Then select the desired folder or create a new one using this command: mkdir [folder_name] + +* Set the AIRFLOW_HOME environment variable. Airflow requires a home directory and uses ~/airflow by default, but you can set a different location if you prefer. The AIRFLOW_HOME environment variable is used to inform Airflow of the desired location. + +``` bash , id="set Airflow Home directory", role="content-editable emits-gtm-events" +AIRFLOW_HOME=./[folder_name] +``` + +* Install apache-airflow stable version 2.8.2 from PyPI repository.: + +``` bash , id="Install Airflow", role="content-editable emits-gtm-events" + AIRFLOW_VERSION=2.8.2 + + PYTHON_VERSION="$(python3 --version | cut -d " " -f 2 | cut -d "." -f 1-2)" + + CONSTRAINT_URL="https://raw.githubusercontent.com/apache/airflow/constraints-${AIRFLOW_VERSION}/constraints-${PYTHON_VERSION}.txt" + + pip install "apache-airflow==${AIRFLOW_VERSION}" --constraint "${CONSTRAINT_URL}" --default-timeout=100 +``` + +* Install the Airflow Teradata provider stable version from PyPI repository. + +``` bash , id="Install Airflow Teradata", role="content-editable emits-gtm-events" +pip install "apache-airflow-providers-teradata" --default-timeout=100 +``` + +* Install Docker Desktop so that you can use docker container for running airflow. Ensure that the docker desktop is running. + +* Check docker version using this command: + +``` bash , id="Check Docker version", role="content-editable emits-gtm-events" +docker --version +``` + +Check the version of docker compose. Docker Compose is a tool for defining and running multi-container applications + +``` bash , id="Check Docker compose version", role="content-editable emits-gtm-events" +docker-compose --version +``` + +To deploy Airflow on Docker Compose, you need to fetch docker-compose.yaml using this curl command. + +``` bash , id="Fetch docker-compose yaml", role="content-editable emits-gtm-events" + curl -LfO 'https://airflow.apache.org/docs/apache-airflow/2.8.2/docker-compose.yaml' +``` + +Create these folders to use later using following command: + +``` bash , id="Create Airflow folders", role="content-editable emits-gtm-events" +mkdir -p ./dags ./logs ./plugins ./config +``` + + +## Configuring Model Factory Solution Accelerator + +Create a config file inside config folder and set the parameters to corresponding values depending on which model you want to train. + +
+ +Click to reveal the Python code + +
+``` python , id="Model Factory Solution Config File", role="content-editable emits-gtm-events"
+from configparser import ConfigParser
+import os
+
+config = ConfigParser()
+
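+# ModelOps identifiers for this run: replace the project, dataset, connection, and template IDs
+# with the values from your own ModelOps project. The bearer token is read from the BEARER_TOKEN
+# environment variable exported in a later step.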
+config['MAIN'] = {
+    "projectId": "23e1df4b-b630-47a1-ab80-7ad5385fcd8d",
+    "bearerToken": os.environ['BEARER_TOKEN'],
+    "trainDatasetId": "ba39e766-2fdf-426f-ba5c-4ca3e90955fc",
+    "evaluateDatasetId": "74489d62-2af5-4402-b264-715e151a420a",
+    "datasetConnectionId" : "151abf05-1914-4d38-a90d-272d850f212c",
+    "datasetTemplateId": "d8a35d98-21ce-47d0-b9f2-00d355777de1"
+}
+
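+# Hyperparameter overrides passed to the training and evaluation jobs.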
+config['HYPERPARAMETERS'] = {
+    "eta": 0.2,
+    "max_depth": 6
+}
+
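+# CPU and memory resources requested for the ModelOps jobs.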
+config['RESOURCES'] = {
+    "memory": "500m",
+    "cpu": "0.5"
+}
+
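+# Model to operationalize, plus the approval comment, schedule, and engine/image settings used by the downstream tasks.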
+config['MODEL'] = {
+    "modelId": "f937b5d8-02c6-5150-80c7-1e4ff07fea31",
+    "approvalComments": "Approving this model!",
+    "cron": "@once",
+    "engineType": "DOCKER_BATCH",
+    "engine": "python-batch",
+    "dockerImage": "artifacts.td.teradata.com/tdproduct-docker-snapshot/avmo/aoa-python-base:3.9.13-1"
+}
+
+
+with open('./config/modelOpsConfig.ini', 'w') as f:
+    config.write(f)
+```
+
+
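Note that `ConfigParser` lower-cases option names when it writes the file, so the Airflow DAG further below reads keys in lower case, such as `projectid` and `bearertoken`. As an optional sanity check after you run `createConfig.py` in the next step, you can read the generated file back; a minimal sketch:

``` python , id="Verify config ini", role="content-editable emits-gtm-events"
# Optional: confirm that config/modelOpsConfig.ini was written and inspect its contents.
from configparser import ConfigParser

check = ConfigParser()
check.read('./config/modelOpsConfig.ini')
print(check.sections())     # expected: ['MAIN', 'HYPERPARAMETERS', 'RESOURCES', 'MODEL']
print(list(check['MAIN']))  # option names come back lower-cased, e.g. 'projectid', 'bearertoken'
```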
+Now copy the Bearer token from the ModelOps user interface (Left Menu -> Your Account -> Session Details) and set it here as an environment varibale using the following command: + +``` bash , id="Bearer token", role="content-editable emits-gtm-events" +export BEARER_TOKEN='your_token_here' +``` + +Now you can execute the previously created config file, which will create a new ini file inside config folder containing all the required parameters which will be used in the DAG creation step. + +``` python , id="Create config ini", role="content-editable emits-gtm-events" +python3 createConfig.py +``` + +## Create a Airflow DAG containing full ModelOps Lifecycle + +Now you can create a DAG using the following python code. Add this python code file inside dags folder. This DAG contains 5 tasks of ModelOps lifecycle (i.e., Train, Evaluate, Approve, Deploy and Retire) + +.Click to reveal the Python code +[%collapsible] +==== +``` python , id="DAG Code", role="content-editable emits-gtm-events" +import base64 +from datetime import datetime, timedelta, date +import json +import os +import time + +from airflow import DAG +from airflow.operators.python import PythonOperator + +import requests + +from configparser import ConfigParser + +# Read from Config file +config = ConfigParser() +config.read('config/modelOpsConfig.ini') + +config_main = config["MAIN"] +config_hyper_params = config["HYPERPARAMETERS"] +config_resources = config["RESOURCES"] +config_model = config["MODEL"] + +# Default args for DAG +default_args = { + 'owner': 'Tayyaba', + 'retries': 5, + 'retry_delay': timedelta(minutes=2) +} + +def get_job_status(job_id): + + # Use the fetched Job ID to check Job Status + headers_for_status = { + 'AOA-PROJECT-ID': config_main['projectid'], + 'Authorization': 'Bearer ' + config_main['bearertoken'], + } + + status_response = requests.get('https://airflow-u9usja4twtauvt3s.env.clearscape.teradata.com:8443/modelops/core/api/jobs/' + job_id + '?projection=expandJob', headers=headers_for_status) + status_json = status_response.json() + job_status = status_json.get('status') + return job_status + + +def train_model(ti): + + headers = { + 'AOA-Project-ID': config_main['projectid'], + 'Accept': 'application/json, text/plain, */*', + 'Accept-Language': 'en-US,en;q=0.9', + 'Authorization': 'Bearer ' + config_main['bearertoken'], + 'Content-Type': 'application/json', + } + + json_data = { + 'datasetId': config_main['trainDatasetId'], + 'datasetConnectionId': config_main['datasetConnectionId'], + 'modelConfigurationOverrides': { + 'hyperParameters': { + 'eta': config_hyper_params['eta'], + 'max_depth': config_hyper_params['max_depth'], + }, + }, + 'automationOverrides': { + 'resources': { + 'memory': config_resources['memory'], + 'cpu': config_resources['cpu'], + }, + 'dockerImage': config_model['dockerImage'], + }, + } + + + response = requests.post('https://airflow-u9usja4twtauvt3s.env.clearscape.teradata.com:8443/modelops/core/api/models/' + config_model['modelid'] + '/train', headers=headers, json=json_data) + + json_data = response.json() + + # Get the Training Job ID + job_id = json_data.get('id') + ti.xcom_push(key='train_job_id', value=job_id) + + job_status = get_job_status(job_id) + print("Started - Training Job - Status: ", job_status) + + while job_status != "COMPLETED": + if job_status=="ERROR": + print("The training job is terminated due to an Error") + ti.xcom_push(key='trained_model_id', value='NONE') # Setting the Trained Model Id to None here and check in next step (Evaluate) + break + elif 
job_status=="CANCELLED": + ti.xcom_push(key='trained_model_id', value='NONE') + print("The training job is Cancelled !!") + break + print("Job is not completed yet. Current status", job_status) + time.sleep(5) #wait 5s + job_status = get_job_status(job_id) + + # Checking Job status at the end to push the correct trained_model_id + if(job_status == "COMPLETED"): + train_model_id = json_data['metadata']['trainedModel']['id'] + ti.xcom_push(key='trained_model_id', value=train_model_id) + print('Model Trained Successfully! Job ID is : ', job_id, 'Trained Model Id : ', train_model_id, ' Status : ', job_status) + else: + ti.xcom_push(key='trained_model_id', value='NONE') + print("Training Job is terminated !!") + + +def evaluate_model(ti): + + trained_model_id = ti.xcom_pull(task_ids = 'task_train_model', key = 'trained_model_id') + + headers = { + 'AOA-Project-ID': config_main['projectid'], + 'Accept': 'application/json, text/plain, */*', + 'Accept-Language': 'en-US,en;q=0.9', + 'Authorization': 'Bearer ' + config_main['bearertoken'], + 'Content-Type': 'application/json', + } + + json_data = { + 'datasetId': config_main['evaluatedatasetid'], + 'datasetConnectionId': config_main['datasetConnectionId'], + 'modelConfigurationOverrides': { + 'hyperParameters': { + 'eta': config_hyper_params['eta'], + 'max_depth': config_hyper_params['max_depth'], + }, + }, + 'automationOverrides': { + 'resources': { + 'memory': config_resources['memory'], + 'cpu': config_resources['cpu'], + }, + 'dockerImage': config_model['dockerImage'], + }, + } + + if trained_model_id == 'NONE': + ti.xcom_push(key='evaluated_model_status', value='FALIED') + print("Evaluation cannot be done as the Training Job was terminated !!") + else: + response = requests.post('https://airflow-u9usja4twtauvt3s.env.clearscape.teradata.com:8443/modelops/core/api/trainedModels/' + trained_model_id + '/evaluate', headers=headers, json=json_data) + json_data = response.json() + + # Get the Evaluation Job ID + eval_job_id = json_data.get('id') + ti.xcom_push(key='evaluate_job_id', value=eval_job_id) + + job_status = get_job_status(eval_job_id) + print("Started - Job - Status: ", job_status) + + while job_status != "COMPLETED": + if job_status=="ERROR": + print("The evaluation job is terminated due to an Error") + # Set the Trained Model Id to None here and check in next step (Evaluate) + break + elif job_status=="CANCELLED": + print("The evaluation job is Cancelled !!") + break + print("Job is not completed yet. Current status", job_status) + time.sleep(5) # wait 5s + job_status = get_job_status(eval_job_id) + + # Checking Job status at the end to push the correct evaluate_job_id + if(job_status == "COMPLETED"): + ti.xcom_push(key='evaluated_model_status', value='EVALUATED') + print('Model Evaluated Successfully! 
Job ID is : ', eval_job_id, ' Status : ', job_status) + else: + ti.xcom_push(key='evaluated_model_status', value='FAILED') + print("Evaluation Job is terminated !!") + + +def approve_model(ti): + + evaluated_model_status = ti.xcom_pull(task_ids = 'task_evaluate_model', key = 'evaluated_model_status') + + if evaluated_model_status == 'FAILED': + ti.xcom_push(key='approve_model_status', value='FALIED') + print("Approval cannot be done as the Evaluation was failed !!") + else: + trained_model_id = ti.xcom_pull(task_ids = 'task_train_model', key = 'trained_model_id') + + headers = { + 'AOA-Project-ID': config_main['projectid'], + 'Accept': 'application/json, text/plain, */*', + 'Accept-Language': 'en-US,en;q=0.9', + 'Authorization': 'Bearer ' + config_main['bearertoken'], + 'Content-Type': 'application/json', + } + + json_data = { + "comments": (base64.b64encode(config_model['approvalComments'].encode()).decode()) + } + + response = requests.post('https://airflow-u9usja4twtauvt3s.env.clearscape.teradata.com:8443/modelops/core/api/trainedModels/' + trained_model_id + '/approve' , headers=headers, json=json_data) + response_json = response.json() + approval_status = response_json['status'] + if(approval_status == 'APPROVED'): + ti.xcom_push(key='approve_model_status', value='EVALUATED') + print('Model Approved Successfully! Status: ', approval_status) + else: + ti.xcom_push(key='approve_model_status', value='FAILED') + print('Model not approved! Status: ', approval_status) + + +def deploy_model(ti): + + approve_model_status = ti.xcom_pull(task_ids = 'task_approve_model', key = 'approve_model_status') + + headers = { + 'AOA-Project-ID': config_main['projectid'], + 'Accept': 'application/json, text/plain, */*', + 'Accept-Language': 'en-US,en;q=0.9', + 'Authorization': 'Bearer ' + config_main['bearertoken'], + 'Content-Type': 'application/json', + } + + + json_data = { + 'engineType': config_model['engineType'], + 'engineTypeConfig': { + 'dockerImage': config_model['dockerImage'], + 'engine': "python-batch", + 'resources': { + 'memory': config_resources['memory'], + 'cpu': config_resources['cpu'], + } + }, + 'language':"python", + 'datasetConnectionId': config_main['datasetConnectionId'], + 'datasetTemplateId': config_main['datasetTemplateId'], + 'cron': config_model['cron'], + 'publishOnly': "false", + 'args':{} + } + + if approve_model_status == 'FAILED': + ti.xcom_push(key='deploy_model_status', value='FALIED') + print("Deployment cannot be done as the model is not approved !!") + else: + trained_model_id = ti.xcom_pull(task_ids = 'task_train_model', key = 'trained_model_id') + + response = requests.post('https://airflow-u9usja4twtauvt3s.env.clearscape.teradata.com:8443/modelops/core/api/trainedModels/' + trained_model_id + '/deploy', headers=headers, json=json_data) + json_data = response.json() + + # Get the Deployment Job ID + deploy_job_id = json_data.get('id') + ti.xcom_push(key='deploy_job_id', value=deploy_job_id) + + # deployed_model_id = json_data['metadata']['deployedModel']['id'] + + job_status = get_job_status(deploy_job_id) + print("Started - Deployment Job - Status: ", job_status) + + while job_status != "COMPLETED": + if job_status=="ERROR": + ti.xcom_push(key='deploy_model_status', value='FAILED') + print("The deployment job is terminated due to an Error") + break + elif job_status=="CANCELLED": + ti.xcom_push(key='deploy_model_status', value='FAILED') + print("The deployment job is Cancelled !!") + break + print("Job is not completed yet. 
Current status", job_status) + time.sleep(5) # wait 5s + job_status = get_job_status(deploy_job_id) + + # Checking Job status at the end to push the correct deploy_model_status + if(job_status == "COMPLETED"): + ti.xcom_push(key='deploy_model_status', value='DEPLOYED') + print('Model Deployed Successfully! Job ID is : ', deploy_job_id, ' Status : ', job_status) + else: + ti.xcom_push(key='deploy_model_status', value='FAILED') + print("Deployment Job is terminated !!") + + + +def retire_model(ti): + + deployed_model_status = ti.xcom_pull(task_ids = 'task_deploy_model', key = 'deploy_model_status') + + if deployed_model_status == 'FAILED': + ti.xcom_push(key='retire_model_status', value='FALIED') + print("Retirement cannot be done as the model is not deployed !!") + else: + trained_model_id = ti.xcom_pull(task_ids = 'task_train_model', key = 'trained_model_id') + + headers = { + 'AOA-Project-ID': config_main['projectid'], + 'Accept': 'application/json, text/plain, */*', + 'Accept-Language': 'en-US,en;q=0.9', + 'Authorization': 'Bearer ' + config_main['bearertoken'], + 'Content-Type': 'application/json', + } + + # Identifying the deployment ID + get_deployment_id_response = requests.get('https://airflow-u9usja4twtauvt3s.env.clearscape.teradata.com:8443/modelops/core/api/deployments/search/findByStatusAndTrainedModelId?projection=expandDeployment&status=DEPLOYED&trainedModelId=' + trained_model_id , headers=headers) + + get_deployment_id_json = get_deployment_id_response.json() + deployment_id = get_deployment_id_json['_embedded']['deployments'][0]['id'] + + json_data = { + "deploymentId": deployment_id + } + + # Retire the specific deployment + retire_model_response = requests.post('https://airflow-u9usja4twtauvt3s.env.clearscape.teradata.com:8443/modelops/core/api/trainedModels/' + trained_model_id + '/retire', headers=headers, json=json_data) + retire_model_response_json = retire_model_response.json() + + # Get the Evaluation Job ID + retire_job_id = retire_model_response_json.get('id') + ti.xcom_push(key='retire_job_id', value=retire_job_id) + + job_status = get_job_status(retire_job_id) + print("Started - Job - Status: ", job_status) + + while job_status != "COMPLETED": + if job_status=="ERROR": + print("The Retire job is terminated due to an Error") + # Set the Trained Model Id to None here and check in next step (Evaluate) + break + elif job_status=="CANCELLED": + print("The Retire job is Cancelled !!") + break + print("Job is not completed yet. Current status", job_status) + time.sleep(5) # wait 5s + job_status = get_job_status(retire_job_id) + + # Checking Job status at the end to push the correct evaluate_job_id + if(job_status == "COMPLETED"): + ti.xcom_push(key='retire_model_status', value='RETIRED') + print('Model Retired Successfully! 
Job ID is : ', retire_job_id, ' Status : ', job_status) + else: + ti.xcom_push(key='retire_model_status', value='FAILED') + print("Retire Job is terminated !!") + + + +with DAG( + dag_id = 'ModelOps_Accelerator_v1', + default_args=default_args, + description = 'ModelOps lifecycle accelerator for Python Diabetes Prediction model', + start_date=datetime.now(), # Set the start_date as per requirement + schedule_interval='@daily' +) as dag: + task1 = PythonOperator( + task_id='task_train_model', + python_callable=train_model + ) + task2 = PythonOperator( + task_id='task_evaluate_model', + python_callable=evaluate_model + ) + task3 = PythonOperator( + task_id='task_approve_model', + python_callable=approve_model + ) + task4 = PythonOperator( + task_id='task_deploy_model', + python_callable=deploy_model + ) + task5 = PythonOperator( + task_id='task_retire_model', + python_callable=retire_model + ) + + +task1.set_downstream(task2) +task2.set_downstream(task3) +task3.set_downstream(task4) +task4.set_downstream(task5) +``` +==== + +## Initialize Airflow in Docker Compose + +While initializing Airflow services like the internal Airflow database, for operating systems other than Linux, you may get a warning that AIRFLOW_UID is not set, but you can safely ignore it. by setting its environment variable using the following command. + +``` bash , id="UID Airflow variable", role="content-editable emits-gtm-events" +echo -e "AIRFLOW_UID=5000" > .env +``` + +To run internal database migrations and create the first user account, initialize the database using this command: + +``` bash , id="", role="content-editable emits-gtm-events" +docker compose up airflow-init +``` + +After initialization is complete, you should see a message like this: + +``` bash , id="Check Airflow init", role="content-editable emits-gtm-events" + airflow-init_1 | Upgrades done + airflow-init_1 | Admin user airflow created + airflow-init_1 | 2.8.2 + start_airflow-init_1 exited with code 0 +``` + +## Clean up Airflow demo environment + +You can clean up the environment which will remove the preloaded example DAGs using this command: + +``` bash , id="Docker compose down", role="content-editable emits-gtm-events" +docker-compose down -v +``` + +Then update this parameter in docker-compose.yaml file as given below: + +``` bash , id="Docker compose yaml", role="content-editable emits-gtm-events" +AIRFLOW__CORE__LOAD_EXAMPLES: 'false' +``` + +## Launch Airflow with Model Factory Solution Accelerator + +Launch Airflow using this command: + +``` bash , id="Docker compose up", role="content-editable emits-gtm-events" +docker-compose up -d +``` + + +## Run Airflow DAG of Model Factory Solution with ModelOps + +* Now you can access Airflow UI uisng the following http://localhost:8080/ + +![Airflow login](../modelops/images/execute-airflow-workflows-with-clearscape-analytics-modelops-model-factory-solution/LoginPage.png) + +* Login with Usename: airflow and Password: airflow. In the DAGs menu you will be able to see your created DAGs. + +![DAGs](../modelops/images/execute-airflow-workflows-with-clearscape-analytics-modelops-model-factory-solution/DAGs.png) + +* Select your latest created DAG and the graph will look like this: + +![DAGs](../modelops/images/execute-airflow-workflows-with-clearscape-analytics-modelops-model-factory-solution/DAG_graph.png) + +* Now you can trigger the DAG using the play icon on the top right side. 
+ +* You can check the logs by selecting any task and then click on the logs menu: + +* On the ClearScape Analytics ModelOps - Jobs section you can see that the jobs have started running: + +![DAGs](../modelops/images/execute-airflow-workflows-with-clearscape-analytics-modelops-model-factory-solution/modelOps1.png) + +* Now you can see that all the tasks are successfully executed. + +![DAGs](../modelops/images/execute-airflow-workflows-with-clearscape-analytics-modelops-model-factory-solution/successTasks.png) + +## Summary + +This tutorial aimed at providing a hands on exercise on how to install an Airflow environment on a Linux server and how to use Airflow to interact with ClearScape Analytics ModelOps and Teradata Vantage database. An additional example is provided on how to integrate Airflow and the data modelling and maintenance tool dbt to create and load a Teradata Vantage database. + +## Further reading +* [ModelOps documentatioN](https://docs.teradata.com/search/documents?query=ModelOps&sort=last_update&virtual-field=title_only&content-lang=). diff --git a/quickstarts/analyze-data/integrate-teradata-jupyter-extensions-with-google-vertex-ai.md b/quickstarts/analyze-data/integrate-teradata-jupyter-extensions-with-google-vertex-ai.md new file mode 100644 index 0000000000..d29dca6768 --- /dev/null +++ b/quickstarts/analyze-data/integrate-teradata-jupyter-extensions-with-google-vertex-ai.md @@ -0,0 +1,180 @@ +--- +sidebar_position: 11 +author: Hailing Jiang +email: Hailing.Jiang@teradata.com +page_last_update: June 28th, 2022 +description: Integrate Teradata Jupyter extensions with Google Vertex AI +keywords: [data warehouses, compute storage separation, teradata, vantage, cloud data platform, business intelligence, enterprise analytics, jupyter, teradatasql, ipython-sql, teradatasqlalchemy] +--- +import ClearscapeDocsNote from '../_partials/vantage_clearscape_analytics.mdx' +import JupyterClearscapeNote from '../_partials/jupyter_notebook_clearscape_analytics_note.mdx'; + +# Integrate Teradata Jupyter extensions with Google Vertex AI + + + + +### Overview +Teradata Jupyter extensions provide Teradata SQL kernel and several UI extensions to allow users to easily access and navigate Teradata database from Jupyter envioronment. Google Vertex AI is Google Cloud's new unified ML platform. Vertex AI Workbench provides a Jupyter-base development environment for the entire data science workflow. This article describes how to integate our Jupyter extensions with Vertex AI Workbench so that Vertex AI users can take advantage of our Teradata extensions in their ML pipeline. + +Vertex AI workbench supports two types of notebooks: managed notebooks and user-managed notebooks. Here we will focus on user-managed notebooks. We will show two ways to integrate our Jupyter extensions with user-managed notebooks: use startup script to install our kernel and extensions or use custom container. + +### Prerequisites + + + +* Access to a Teradata Vantage instance + +* Google Cloud account with Vertex AI enabled +* Google cloud storage to store startup scripts and Teradata Jupyter extension package + +### Integration + +There are two ways to run Teradata Jupyter Extensions in Vertex AI: + +1. [Use startup script](#use_startup_script) +2. [Use custom container](#use_custom_container) + +These two integration methods are described below. + +### Use startup script + +When we create a new notebook instance, we can specify a startup script. This script runs only once after the instance is created. 
Here are the steps: + +1. Download Teradata Jupyter extensions package + +Go to [Vantage Modules for Jupyter](https://downloads.teradata.com/download/tools/vantage-modules-for-jupyter) page to download the Teradata Jupyter extensions package bundle Linux version. + +2. Upload the package to a Google Cloud storage bucket + +3. Write a startup script and upload it to cloud storage bucket + +Below is a sample script. It fetches Teradata Jupyter extension package from cloud storage bucket and installs Teradata SQL kernel and extensions. + +``` bash , role="content-editable" +#! /bin/bash + +cd /home/jupyter +mkdir teradata +cd teradata +gsutil cp gs://teradata-jupyter/* . +unzip teradatasql*.zip + +# Install Teradata kernel +cp teradatakernel /usr/local/bin + +jupyter kernelspec install ./teradatasql --prefix=/opt/conda + +# Install Teradata extensions +pip install --find-links . teradata_preferences_prebuilt +pip install --find-links . teradata_connection_manager_prebuilt +pip install --find-links . teradata_sqlhighlighter_prebuilt +pip install --find-links . teradata_resultset_renderer_prebuilt +pip install --find-links . teradata_database_explorer_prebuilt + +# PIP install the Teradata Python library +pip install teradataml + +# Install Teradata R library (optional, uncomment this line only if you use an environment that supports R) +#Rscript -e "install.packages('tdplyr',repos=c('https://r-repo.teradata.com','https://cloud.r-project.org'))" +``` + + +4. Create a new notebook and add the startup script from cloud storage bucket +![create a new notebook with startup script](../cloud-guides/images/integrate-teradata-jupyter-extensions-with-google-vertex-ai/vertex.create.notebook.startupscript.png) + +5. It may take a few minutes for the notebook creation process to complete. When it is done, click on `Open notebook`. +![Open notebook](../cloud-guides/images/integrate-teradata-jupyter-extensions-with-google-vertex-ai/vertex.open.notebook.png) + +### Use custom container + +Another option is to provide a custom container when creating a notebook. + +1. Download Teradata Jupyter extensions package + +Go to [Vantage Modules for Jupyter](https://downloads.teradata.com/download/tools/vantage-modules-for-jupyter) page to download the Teradata Jupyter extensions package bundle Linux version. + +2. Copy this package to your work directory and unzip it + +3. Build custom Docker image + +The custom container must expose a service on port 8080. It is recommended to create a container derived from a Google Deep Learning Containers image, because those images are already configured to be compatible with user-managed notebooks. 
+ +Below is a sample Dockerfile you can use to build a Docker image with Teradata SQL kernel and extensions installed: + +``` bash +# Use one of the deep learning images as base image +# if you need both Python and R, use one of the R images +FROM gcr.io/deeplearning-platform-release/r-cpu:latest + +USER root + +############################################################## +# Install kernel and copy supporting files +############################################################## + +# Copy the kernel +COPY ./teradatakernel /usr/local/bin + +RUN chmod 755 /usr/local/bin/teradatakernel + +# Copy directory with kernel.json file into image +COPY ./teradatasql teradatasql/ + +# Copy notebooks and licenses +COPY ./notebooks/ /home/jupyter +COPY ./license.txt /home/jupyter +COPY ./ThirdPartyLicenses/ /home/jupyter + +# Install the kernel file to /opt/conda jupyter lab instance +RUN jupyter kernelspec install ./teradatasql --prefix=/opt/conda + +############################################################## +# Install Teradata extensions +############################################################## + +RUN pip install --find-links . teradata_preferences_prebuilt && \ + pip install --find-links . teradata_connection_manager_prebuilt && \ + pip install --find-links . teradata_sqlhighlighter_prebuilt && \ + pip install --find-links . teradata_resultset_renderer_prebuilt && \ + pip install --find-links . teradata_database_explorer_prebuilt + +# Give back ownership of /opt/conda to jovyan +RUN chown -R jupyter:users /opt/conda + +# PIP install the Teradata Python libraries +RUN pip install teradataml + +# Install Teradata R library (optional, include it only if you use a base image that supports R) +RUN Rscript -e "install.packages('tdplyr',repos=c('https://r-repo.teradata.com','https://cloud.r-project.org'))" +``` + +4. In your work directory (where you unzipped Teradata Jupyter extensions package), run `docker build` to build the image: + +``` bash , id="jupyterlab_ext_on_vertex_first_run", role="content-editable emits-gtm-events +docker build -f Dockerfile imagename:imagetag . +``` + +5. Push the docker image to Google container registry or artifact registry + +Please refer to the following documentations to push docker image to registry: + +* [Container Registry: Pushing and pulling images](https://cloud.google.com/container-registry/docs/pushing-and-pulling) +* [Artifact Registry: Pushing and pulling images](https://cloud.google.com/artifact-registry/docs/docker/pushing-and-pulling?hl=en) + +6. 
Create a new notebook + +In `Environment` section, set `custom container` field to the location of your newly created custom container: +![Open notebook](../cloud-guides/images/integrate-teradata-jupyter-extensions-with-google-vertex-ai/vertex.custom.container.png) + +### Further reading +* [Teradata Jupyter Extensions Website](https://teradata.github.io/jupyterextensions) +* [Teradata Vantage™ Modules for Jupyter Installation Guide](https://docs.teradata.com/r/KQLs1kPXZ02rGWaS9Ktoww/root) +* [Teradata® Package for Python User Guide](https://docs.teradata.com/r/1YKutX2ODdO9ppo_fnguTA/root) +* [Vertex AI documentation: Create a custom container image for training](https://cloud.google.com/vertex-ai/docs/training/create-custom-container) +* [Vertex AI documentation: Create a user-managed notebooks instance by using a custom container](https://cloud.google.com/vertex-ai/docs/workbench/user-managed/custom-container) +* [Vertex AI documentation: Create a user-managed notebooks instance](https://cloud.google.com/vertex-ai/docs/workbench/user-managed/create-new) + +import CommunityLinkPartial from '../_partials/community_link.mdx'; + + diff --git a/quickstarts/analyze-data/integrate-teradata-jupyter-extensions-with-sagemaker.md b/quickstarts/analyze-data/integrate-teradata-jupyter-extensions-with-sagemaker.md new file mode 100644 index 0000000000..553667599d --- /dev/null +++ b/quickstarts/analyze-data/integrate-teradata-jupyter-extensions-with-sagemaker.md @@ -0,0 +1,141 @@ +--- +sidebar_position: 11 +author: Hailing Jiang +email: Hailing.Jiang@teradata.com +page_last_update: September 27th, 2022 +description: Integrate Teradata Jupyter extensions with SageMaker notebook instance +keywords: [data warehouses, compute storage separation, teradata, vantage, cloud data platform, business intelligence, enterprise analytics, jupyter, teradatasql, ipython-sql, teradatasqlalchemy] +--- + +import ClearscapeDocsNote from '../_partials/vantage_clearscape_analytics.mdx' +import JupyterClearScapeNote from '../_partials/jupyter_notebook_clearscape_analytics_note.mdx'; + +# Integrate Teradata Jupyter extensions with SageMaker notebook instance + + + + +### Overview +Teradata Jupyter extensions provide Teradata SQL kernel and several UI extensions to allow users to easily asccess and navigate Teradata database from Jupyter envioronment. This article describes how to integate our Jupyter extensions with SageMaker notebook instance. + +### Prerequisites + + +* Access to a Teradata Vantage instance + +* AWS account +* AWS S3 bucket to store lifecycle configuration scripts and Teradata Jupyter extension package + +### Integration + +SageMaker supports customization of notebook instances using lifecycle configuration scripts. Below we will demo how to use lifecycle configuration scripts to install our Jupyter kernel and extensions in a notebook instance. + +### Steps to integrate with notebook instance + +1. Download Teradata Jupyter extensions package + +Download Linux version from https://downloads.teradata.com/download/tools/vantage-modules-for-jupyter and upload it to an S3 bucket. This zipped package contains Teradata Jupyter kernel and extensions. Each extension has 2 files, the one with "_prebuilt" in the name is prebuilt extension which can be installed using PIP, the other one is source extension that needs to be installed using "jupyter labextension". It is recommended to use prebuilt extensions. + +2. 
Create a lifecycle configuration for notebook instance +![create a lifecycle configuration for notebook instance](../cloud-guides/images/integrate-teradata-jupyter-extensions-with-sagemaker/sagemaker.notebook.create.lifecycle.config.png) + +Here are sample scripts that fetches the Teradata package from S3 bucket and installs Jupyter kernel and extensions. Note that on-create.sh creates a custom conda env that persists on notebook instance's EBS volume so that the installation will not get lost after notebook restarts. on-start.sh installs Teradata kernel and extensions to the custom conda env. + +on-create.sh + +``` bash , role="content-editable +#!/bin/bash + +set -e + +# This script installs a custom, persistent installation of conda on the Notebook Instance's EBS volume, and ensures +# that these custom environments are available as kernels in Jupyter. + + +sudo -u ec2-user -i <<'EOF' +unset SUDO_UID +# Install a separate conda installation via Miniconda +WORKING_DIR=/home/ec2-user/SageMaker/custom-miniconda +mkdir -p "$WORKING_DIR" +wget https://repo.anaconda.com/miniconda/Miniconda3-4.6.14-Linux-x86_64.sh -O "$WORKING_DIR/miniconda.sh" +bash "$WORKING_DIR/miniconda.sh" -b -u -p "$WORKING_DIR/miniconda" +rm -rf "$WORKING_DIR/miniconda.sh" +# Create a custom conda environment +source "$WORKING_DIR/miniconda/bin/activate" +KERNEL_NAME="teradatasql" + +PYTHON="3.8" +conda create --yes --name "$KERNEL_NAME" python="$PYTHON" +conda activate "$KERNEL_NAME" +pip install --quiet ipykernel + +EOF +``` + + +on-start.sh + +``` bash , role="content-editable" +#!/bin/bash + +set -e + +# This script installs Teradata Jupyter kernel and extensions. + + +sudo -u ec2-user -i <<'EOF' +unset SUDO_UID + +WORKING_DIR=/home/ec2-user/SageMaker/custom-miniconda + +source "$WORKING_DIR/miniconda/bin/activate" teradatasql + +# fetch Teradata Jupyter extensions package from S3 and unzip it +mkdir -p "$WORKING_DIR/teradata" +aws s3 cp s3://sagemaker-teradata-bucket/teradatasqllinux_3.3.0-ec06172022.zip "$WORKING_DIR/teradata" +cd "$WORKING_DIR/teradata" + +unzip -o teradatasqllinux_3.3.0-ec06172022.zip + +# install Teradata kernel +cp teradatakernel /home/ec2-user/anaconda3/condabin +jupyter kernelspec install --user ./teradatasql + +# install Teradata Jupyter extensions +source /home/ec2-user/anaconda3/bin/activate JupyterSystemEnv + +pip install teradata_connection_manager_prebuilt-3.3.0.tar.gz +pip install teradata_database_explorer_prebuilt-3.3.0.tar.gz +pip install teradata_preferences_prebuilt-3.3.0.tar.gz +pip install teradata_resultset_renderer_prebuilt-3.3.0.tar.gz +pip install teradata_sqlhighlighter_prebuilt-3.3.0.tar.gz + +conda deactivate +EOF +``` + +3. Create a notebook instance. Please select 'Amazon Linux 2, Jupyter Lab3' for Platform identifier and select the lifecycle configuration created in step 2 for Lifecycle configuration. + +![Create notebook instance](../cloud-guides/images/integrate-teradata-jupyter-extensions-with-sagemaker/sagemaker.notebook.create.notebook.instance.png) + +You might also need to add vpc, subnet and security group in 'Network' section to gain access to Teradata databases. + +4. Wait until notebook instance Status turns 'InService', click 'Open JupyterLab' to open the notebook. 
+ +![Open notebook](../cloud-guides/images/integrate-teradata-jupyter-extensions-with-sagemaker/sagemaker.notebook.notebook.inservice.png) + + +Access the demo notebooks to get usage tips +![access demo notebooks](../cloud-guides/images/integrate-teradata-jupyter-extensions-with-sagemaker/sagemaker.notebook.start.png) + + +### Further reading +* [Teradata Jupyter Extensions Website](https://teradata.github.io/jupyterextensions) +* [Teradata Vantage™ Modules for Jupyter Installation Guide](https://docs.teradata.com/r/KQLs1kPXZ02rGWaS9Ktoww/root) +* [Teradata® Package for Python User Guide](https://docs.teradata.com/r/1YKutX2ODdO9ppo_fnguTA/root) +* [Customize a Notebook Instance Using a Lifecycle Configuration Script](https://docs.aws.amazon.com/sagemaker/latest/dg/notebook-lifecycle-config.html) +* [amazon sagemaker notebook instance lifecycle config samples](https://github.com/aws-samples/amazon-sagemaker-notebook-instance-lifecycle-config-samples/blob/master/scripts/persistent-conda-ebs/on-create.sh) + +import CommunityLinkPartial from '../_partials/community_link.mdx'; + + diff --git a/quickstarts/analyze-data/integrate-teradata-vantage-with-knime.md b/quickstarts/analyze-data/integrate-teradata-vantage-with-knime.md new file mode 100644 index 0000000000..143ebf2a51 --- /dev/null +++ b/quickstarts/analyze-data/integrate-teradata-vantage-with-knime.md @@ -0,0 +1,64 @@ +--- +sidebar_position: 13 +author: Jeremy Yu +email: Jeremy.yu@teradata.com +page_last_update: May 18th, 2022 +description: Integrate Teradata Vantage with KNIME. +keywords: [data warehouses, compute storage separation, teradata, vantage, cloud data platform, object storage, business intelligence, enterprise analytics] +--- + +# Integrate Teradata Vantage with KNIME Analytics Platform + +## Overview + +This how-to describes how to connect to Terdata Vantage from KNIME Analytics Platform. + +### About KNIME Analytics Platform + +KNIME Analytics Platform is a data science workbench. It supports analytics on various data sources, including Teradata Vantage. + +## Prerequisites + + +import ClearscapeDocsNote from '../_partials/vantage_clearscape_analytics.mdx' + +* Access to a Teradata Vantage instance, version 17.10 or higher. + +* KNIME installed locally. See [KNIME installation instructions](https://www.knime.com/installation) for details. + +## Integration Procedure + +1. Go to https://downloads.teradata.com/download/connectivity/jdbc-driver (first time users will need to register) and download the latest version of the JDBC driver. +2. Unzip the downloaded file. You will find `terajdbc4.jar` file. +3. In KNIME, click on `File → Preference`. Under `Databases`, click `Add`: +![Add jar](../other-integrations/images/integrate-teradata-vantage-with-knime/add-jar.png) +4. Register a new database driver. Provide values for `ID`, `Name` and `Description` like below. Click on `Add file` and point to the .jar file you downloaded earlier. Click on the `Find driver classes` and the `Driver class:` should populate with the `jdbc.TeraDriver`: +![Register driver](../other-integrations/images/integrate-teradata-vantage-with-knime/register-driver.png) +5. Click `Apply and Close`: +![Apply and close](../other-integrations/images/integrate-teradata-vantage-with-knime/apply-and-close.png) +6. 
To test the connection, create a new KNIME workflow and add a `Database Reader (legacy)` node by dragging it to the workspace to the right: +![Test connection step 1](../other-integrations/images/integrate-teradata-vantage-with-knime/test-connection-1.png) +![Test connection step 2](../other-integrations/images/integrate-teradata-vantage-with-knime/test-connection-2.png) +7. Right-click on the `Database Reader (legacy)` to configure settings. Select `com.teradata.jdbc.Teradriver` from the drop-down: +![Start configuration](../other-integrations/images/integrate-teradata-vantage-with-knime/start-configuration.png) +8. Enter the name of the Vantage server and login mechanism, e.g.: +![Enter configuration](../other-integrations/images/integrate-teradata-vantage-with-knime/enter-configuration.png) +9. To test connection, enter SQL statement in box in lower right. For example, enter `SELECT * FROM DBC.DBCInfoV` and click `Apply` to close the dialog: +![Test connection apply](../other-integrations/images/integrate-teradata-vantage-with-knime/test-connection-apply.png) +10. Execute the node to test the connection: +![Execute node](../other-integrations/images/integrate-teradata-vantage-with-knime/execute-node.png) +11. The node will show a green light when run successfully. Right-click and select `Data from Database` to view the results: +![View results](../other-integrations/images/integrate-teradata-vantage-with-knime/view-results.png) +![View results](../other-integrations/images/integrate-teradata-vantage-with-knime/view-results-final.png) + + +## Summary + +This how-to demonstrats how to connect from KNIME Analytics Platform to Teradata Vantage. + +## Further reading +* [Train ML models in Vantage using only SQL](https://quickstarts.teradata.com/ml.html) + +import CommunityLinkPartial from '../_partials/community_link.mdx'; + + \ No newline at end of file diff --git a/quickstarts/analyze-data/jupyter.md b/quickstarts/analyze-data/jupyter.md new file mode 100644 index 0000000000..8bcc9ba4d8 --- /dev/null +++ b/quickstarts/analyze-data/jupyter.md @@ -0,0 +1,155 @@ +--- +sidebar_position: 1 +id: jupyter +title: Run Vantage Express on UTM +author: Adam Tworkiewicz +email: adam.tworkiewicz@teradata.com +page_last_update: November 10th, 2022 +description: Use Teradata Vantage from a Jupyter notebook +keywords: [data warehouses, compute storage separation, teradata, vantage, cloud data platform, JDBC, java applications, business intelligence, enterprise analytics, jupyter, teradatasql, ipython-sql, teradatasqlalchemy] +--- + +import ClearscapeJyupiter from '../_partials/jupyter_notebook_clearscape_analytics_note.mdx' +import ClearscapeDocsNote from '../_partials/vantage_clearscape_analytics.mdx' +import CommunityLink from '../_partials/community_link.mdx' + +# Use Vantage from a Jupyter notebook + + + +## Overview +In this how-to we will go through the steps for connecting to Teradata Vantage from a Jupyter notebook. + + + +## Options + +There are a couple of ways to connect to Vantage from a Jupyter Notebook: + +1. [Use python or R libraries in a regular Python/R kernel notebook](#teradata-libraries) - this option works well when you are in a restricted environment that doesn't allow you to spawn your own Docker images. Also, it's useful in traditional datascience scenarios when you have to mix SQL and Python/R in a notebook. If you are proficient with Jupyter and have your own set of preferred libraries and extensions, start with this option. +2. 
[Use the Teradata Jupyter Docker image](#teradata-jupyter-docker-image) - the Teradata Jupyter Docker image bundles the Teradata SQL kernel (more on this later), `teradataml` and `tdplyr` libraries, python and R drivers. It also contains Jupyter extensions that allow you to manage Teradata connections, explore objects in Vantage database. It's convenient when you work a lot with SQL or would find a visual Navigator helpful. If you are new to Jupyter or if you prefer to get a currated assembly of libraries and extensions, start with this option. + +### Teradata libraries + +This option uses a regular Jupyter Lab notebook. We will see how to load the Teradata Python driver and use it from Python code. We will also examine `ipython-sql` extension that adds support for SQL-only cells. + +1. We start with a plain Jupyter Lab notebook. Here, I'm using docker but any method of starting a notebook, including Jupyter Hub, Google Cloud AI Platform Notebooks, AWS SageMaker Notebooks, Azure ML Notebooks will do. + +``` +docker run --rm -p 8888:8888 -e JUPYTER_ENABLE_LAB=yes \ + -v "${PWD}":/home/jovyan/work jupyter/datascience-notebook +``` + +2. Docker logs will display the url that you need to go to: +``` +Entered start.sh with args: jupyter lab +Executing the command: jupyter lab +.... +To access the server, open this file in a browser: + file:///home/jovyan/.local/share/jupyter/runtime/jpserver-7-open.html +Or copy and paste one of these URLs: + http://d5c2323ae5db:8888/lab?token=5fb43e674367c6895e8c2404188aa550b5c7bdf96f5b4a3a + or http://127.0.0.1:8888/lab?token=5fb43e674367c6895e8c2404188aa550b5c7bdf96f5b4a3a +``` + +3. We will open a new notebook and create a cell to install the required libraries: +:::note +I've published a notebook with all the cells described below on GitHub: https://github.com/Teradata/quickstarts/blob/main/modules/ROOT/attachments/vantage-with-python-libraries.ipynb + +``` +import sys +!{sys.executable} -m pip install teradatasqlalchemy +``` + +4. Now, we will import `Pandas` and define the connection string to connect to Teradata. Since I'm running my notebook in Docker on my local machine and I want to connect to a local Vantage Express VM, I'm using `host.docker.internal` DNS name provided by Docker to reference the IP of my machine. + +``` +import pandas as pd +# Define the db connection string. Pandas uses SQLAlchemy connection strings. +# For Teradata Vantage, it's teradatasql://username:password@host/database_name . +# See https://pypi.org/project/teradatasqlalchemy/ for details. +db_connection_string = "teradatasql://dbc:dbc@host.docker.internal/dbc" +``` + +5. I can now call Pandas to query Vantage and move the result to a Pandas dataframe: +``` +pd.read_sql("SELECT * FROM dbc.dbcinfo", con = db_connection_string) +``` + +6. The syntax above is concise but it can get tedious if all you need is to explore data in Vantage. We will use `ipython-sql` and its `%%sql` magic to create SQL-only cells. We start with importing the required libraries. +``` +import sys +!{sys.executable} -m pip install ipython-sql teradatasqlalchemy +``` + +7. We load `ipython-sql` and define the db connection string: +``` +%load_ext sql +# Define the db connection string. The sql magic uses SQLAlchemy connection strings. +# For Teradata Vantage, it's teradatasql://username:password@host/database_name . +# See https://pypi.org/project/teradatasqlalchemy/ for details. +%sql teradatasql://dbc:dbc@host.docker.internal/dbc +``` + +8. We can now use `%sql` and `%%sql` magic. 
Let's say we want to explore data in a table. We can create a cell that says: +``` +%%sql +SELECT * FROM dbc.dbcinfo +``` + +9. If we want to move the data to a Pandas frame, we can say: +``` +result = %sql SELECT * FROM dbc.dbcinfo +result.DataFrame() +``` + +There are many other features that ipython-sql provides, including variable substitution, plotting with `matplotlib`, writting results to a local csv file or back to the database. See [the demo notebook](https://github.com/Teradata/quickstarts/blob/main/modules/ROOT/attachments/vantage-with-python-libraries.ipynb) for examples and [ipython-sql github repo](https://github.com/catherinedevlin/ipython-sql/) for a complete reference. + +### Teradata Jupyter Docker image + +The Teradata Jupyter Docker image builds on `jupyter/datascience-notebook` Docker image. It adds the Teradata SQL kernel, Teradata Python and R libraries, Jupyter extensions to make you productive while interacting with Teradata Vantage. The image also contains sample notebooks that demonstrate how to use the SQL kernel and Teradata libraries. + +The SQL kernel and Teradata Jupyter extensions are useful for people that spend a lot of time with the SQL interface. Think about it as a notebook experience that, in many cases, is more convenient than using Teradata Studio. The Teradata Jupyter Docker image doesn't try to replace Teradata Studio. It doesn't have all the features. It's designed for people who need a lightweight, web-based interface and enjoy the notebook UI. + +The Teradata Jupyter Docker image can be used when you want to run Jupyter locally or you have a place where you can run custom Jupyter docker images. The steps below demonstrate how to use the image locally. + +1. Run the image: + +:::note +By passing `-e "accept_license=Y` you accept [the license agreement](https://github.com/Teradata/jupyterextensions/blob/master/licensefiles/license.txt) for Teradata Jupyter Extensions. +::: + +``` +docker volume create notebooks +docker run -e "accept_license=Y" -p :8888:8888 \ + -v notebooks:/home/jovyan/JupyterLabRoot \ + teradata/jupyterlab-extensions +``` + +2. Docker logs will display the url that you need to go to. For example, this is what I've got: +``` +Starting JupyterLab ... +Docker Build ID = 3.2.0-ec02012022 +Using unencrypted HTTP + +Enter this URL in your browser: http://localhost:8888?token=96a3ab874a03779c400966bf492fe270c2221cdcc74b61ed + +* Or enter this token when prompted by Jupyter: 96a3ab874a03779c400966bf492fe270c2221cdcc74b61ed +* If you used a different port to run your Docker, replace 8888 with your port number +``` + +3. Open up the URL and use the file explorer to open the following notebook: `jupyterextensions -> notebooks -> sql -> GettingStartedDemo.ipynb`. +4. Go through the demo of the Teradata SQL Kernel: + +![GettingStartedDemo.ipynb ](../images/gettingstarteddemo.ipynb.png) + +## Summary + +This quick start covered different options to connect to Teradata Vantage from a Jupyter Notebook. We learned about the Teradata Jupyter Docker image that bundles multiple Teradata Python and R libraries. It also provides an SQL kernel, database object explorer and connection management. These features are useful when you spend a lot of time with the SQL interface. For more traditional data science scenarios, we explored the standalone Teradata Python driver and integration through the ipython sql extension. 
+ +## Further reading +* [Teradata Jupyter Extensions Website](https://teradata.github.io/jupyterextensions) +* [Teradata Vantage™ Modules for Jupyter Installation Guide](https://docs.teradata.com/r/KQLs1kPXZ02rGWaS9Ktoww/root) +* [Teradata® Package for Python User Guide](https://docs.teradata.com/r/1YKutX2ODdO9ppo_fnguTA/root) + + \ No newline at end of file diff --git a/quickstarts/analyze-data/local-jupyter-hub.md b/quickstarts/analyze-data/local-jupyter-hub.md new file mode 100644 index 0000000000..d90392de36 --- /dev/null +++ b/quickstarts/analyze-data/local-jupyter-hub.md @@ -0,0 +1,171 @@ +--- +sidebar_position: 2 +id: local-jupyter-hub +title: Deploy Teradata Jupyter extensions to JupyterHub +author: Hailing Jiang +email: Hailing.iang@teradata.com +page_last_update: November 17th, 2021 +description: Deploy Teradata Jupyter extensions in customer JupyterHub clusters +keywords: [data warehouses, compute storage separation, teradata, vantage, cloud data platform, java applications, business intelligence, enterprise analytics, jupyter, teradatasql, ipython-sql, teradatasqlalchemy] +--- + +import ClearscapeDocsNote from '../_partials/vantage_clearscape_analytics.mdx' +import CommunityLink from '../_partials/community_link.mdx' + +# Deploy Teradata Jupyter extensions to JupyterHub + +## Overview + +For customers who have their own JupyterHub clusters, there are two options to integrate Teradata Jupyter extensions into the existing clusters: + +1. Use Teradata Jupyter Docker image. +2. Customize an existing Docker image to include Teradata extensions. + +This page contains detailed instructions on the two options. Instructions are based on the assumption that the customer JupyterHub deployment is based on [Zero to JupyterHub with Kubernetes](https://zero-to-jupyterhub.readthedocs.io/en/latest/index.html). + + + +## Use Teradata Jupyter Docker image + +Teradata provides a ready-to-run Docker image that builds on the [jupyter/datascience-notebook](https://hub.docker.com/r/jupyter/datascience-notebook/) image. It bundles the Teradata SQL kernel, Teradata Python and R libraries and drivers and Teradata extensions for Jupyter to make you productive while interacting with Teradata database. The image also contains sample notebooks that demonstrate how to use the SQL kernel, extensions and Teradata libraries. + +You can use this image in the following ways: + +* Start a personal Jupyter Notebook server in a local Docker container +* Run JupyterLab servers for a team using JupyterHub + +For instructions to start a personal JupyterLab server in a local Docker container, please see [installation guide](https://docs.teradata.com/r/KQLs1kPXZ02rGWaS9Ktoww/Fwvns7y_a7juDWx1NixC2A). This section will focus on how to use the  Teradata Jupyter Docker image in a customer's existing JupyterHub environment. + +### Install Teradata Jupyter Docker image in your registry + +1. Go to [Vantage Modules for Jupyter](https://downloads.teradata.com/download/tools/vantage-modules-for-jupyter) page and download the Docker image. It is a tarball with name in this format `teradatajupyterlabext_VERSION.tar.gz`. + +2. Load the image: +``` +docker load -i teradatajupyterlabext_VERSION.tar.gz +``` + +3. Push the image to your Docker registry: +``` +docker push +``` + +:::tip +You may want to consider changing the name of the loaded image for simplicity: + +``` +docker tag OLD_IMAGE_NAME NEW_IMAGE_NAME +``` +::: + +### Use Teradata Jupyter Docker image in JupyterHub + +1. 
To use the Teradata Jupyter Docker image directly in your JupyterHub cluster, modify the override file as described in the [JupyterHub documentation](https://zero-to-jupyterhub.readthedocs.io/en/latest/jupyterhub/customizing/user-environment.html#choose-and-use-an-existing-docker-image). Replace `REGISTRY_URL` and `VERSION` with appropriate values from the step above:

```
singleuser:
  image:
    name: REGISTRY_URL/teradatajupyterlabext_VERSION
    tag: latest
```

2. Apply the changes to the cluster as described in the [JupyterHub documentation](https://zero-to-jupyterhub.readthedocs.io/en/latest/jupyterhub/customizing/extending-jupyterhub.html#applying-configuration-changes).

:::tip
You can use multiple profiles to allow users to select which image they want to use when they log in to JupyterHub. For detailed instructions and examples on configuring multiple profiles, please see the [JupyterHub documentation](https://zero-to-jupyterhub.readthedocs.io/en/latest/jupyterhub/customizing/user-environment.html#using-multiple-profiles-to-let-users-select-their-environment).
:::

### Customize Teradata Jupyter Docker image

If your users need some packages or notebooks that are not bundled in the Teradata Jupyter Docker image, we recommend that you use the Teradata image as a base image and build a new one on top of it.

Here is an example Dockerfile that builds on top of the Teradata image and adds additional packages and notebooks. Use the Dockerfile to build a new Docker image, push the image to a designated registry, modify the override file as shown above to use the new image as the singleuser image, and apply the changes to the cluster as described above. Replace `REGISTRY_URL` and `VERSION` with appropriate values:

```
FROM REGISTRY_URL/teradatajupyterlabext_VERSION:latest

# install additional packages
RUN pip install --no-cache-dir astropy

# copy notebooks
COPY notebooks/. /tmp/JupyterLabRoot/DemoNotebooks/
```

## Customize an existing Docker image to include Teradata extensions

If you prefer, you can include the Teradata SQL kernel and extensions into an existing image you are currently using.

1. Go to the [Vantage Modules for Jupyter](https://downloads.teradata.com/download/tools/vantage-modules-for-jupyter) page to download the zipped Teradata Jupyter extensions package bundle. Assuming your existing Docker image is Linux-based, you will want to use the Linux version of the download. Otherwise, download for the platform you are using. The `.zip` file contains the Teradata SQL Kernel, extensions and sample notebooks.
2. Unzip the bundle file to your working directory.
3. Below is an example Dockerfile to add Teradata Jupyter extensions to your existing Docker image.
Use the Dockerfile to build a new Docker image, push the image to a designated registry, modify override file as shown above to use the new image as singleuser image, apply the changes to the cluster: + +``` +FROM REGISTRY_URL/your-existing-image:tag +ENV NB_USER=jovyan \ + HOME=/home/jovyan \ + EXT_DIR=/opt/teradata/jupyterext/packages + +USER root + +############################################################## +# Install kernel and copy supporting files +############################################################## + +# Copy the kernel +COPY ./teradatakernel /usr/local/bin +RUN chmod 755 /usr/local/bin/teradatakernel + +# Copy directory with kernel.json file into image +COPY ./teradatasql teradatasql/ + +############################################################## +# Switch to user jovyan to copy the notebooks and license files. +############################################################## + +USER $NB_USER + +# Copy notebooks +COPY ./notebooks/ /tmp/JupyterLabRoot/TeradataSampleNotebooks/ + +# Copy license files +COPY ./ThirdPartyLicenses /tmp/JupyterLabRoot/ThirdPartyLicenses/ + +USER root + +# Install the kernel file to /opt/conda jupyter lab instance +RUN jupyter kernelspec install ./teradatasql --prefix=/opt/conda + +############################################################## +# Install Teradata extensions +############################################################## + +COPY ./teradata_*.tgz $EXT_DIR + +WORKDIR $EXT_DIR + +RUN jupyter labextension install --no-build teradata_database* && \ + jupyter labextension install --no-build teradata_resultset* && \ + jupyter labextension install --no-build teradata_sqlhighlighter* && \ + jupyter labextension install --no-build teradata_connection_manager* && \ + jupyter labextension install --no-build teradata_preferences* && \ + jupyter lab build --dev-build=False --minimize=False && \ + rm -rf * + +WORKDIR $HOME + +# Give back ownership of /opt/conda to jovyan +RUN chown -R jovyan:users /opt/conda + +# Jupyter will create .local directory +RUN rm -rf $HOME/.local +``` + +4. You can optionally install Teradata package for Python and Teradata package for R. See the following pages for details: +* [Teradata Package for Python - teradataml download page](https://downloads.teradata.com/download/aster/teradata-python-package-teradataml) +* [Teradata Package for R - tdplyr download page](https://downloads.teradata.com/download/aster/tdplyr-download-page) + +## Further reading +* [Teradata Jupyter Extensions Website](https://teradata.github.io/jupyterextensions) +* [Teradata Vantage™ Modules for Jupyter Installation Guide](https://docs.teradata.com/r/KQLs1kPXZ02rGWaS9Ktoww/root) +* [Teradata® Package for Python User Guide](https://docs.teradata.com/r/1YKutX2ODdO9ppo_fnguTA/root) + + diff --git a/quickstarts/analyze-data/ml.md b/quickstarts/analyze-data/ml.md new file mode 100644 index 0000000000..7887a5fe5c --- /dev/null +++ b/quickstarts/analyze-data/ml.md @@ -0,0 +1,270 @@ +--- +sidebar_position: 3 +id: ml +title: Train ML models in Vantage using Database Analytic Functions +author: Krutik Pathak +email: krutik.pathak@teradata.com +page_last_update: November 21st, 2023 +description: Train an ML model without leaving Teradata Vantage - use Vantage Database Analytic Functions to create ML models. 
+keywords: [data warehouses, database analytic functions, compute storage separation, teradata, vantage, cloud data platform, object storage, business intelligence, enterprise analytics, AI/ML] +--- + +import ClearscapeDocsNote from '../_partials/vantage_clearscape_analytics.mdx' +import CommunityLink from '../_partials/community_link.mdx' + +# Train ML models in Vantage using Database Analytic Functions + +## Overview + +There are situations when you want to quickly validate a machine learning model idea. You have a model type in mind. You don't want to operationalize with an ML pipeline just yet. You just want to test out if the relationship you had in mind exists. Also, sometimes even your production deployment doesn't require constant relearning with MLops. In such cases, you can use Database Analytic Functions for feature engineering, train different ML models, score your models, and evaluate your model on different model evaluation functions. + +## Prerequisites + +You need access to a Teradata Vantage instance. + + +## Load the sample data + +Here in this example we will be using the sample data from `val` database. We will use the `accounts`, `customer`, and `transactions` tables. We will be creating some tables in the process and you might face some issues while creating tables in `val` database, so let's create our own database `td_analytics_functions_demo`. + +``` +CREATE DATABASE td_analytics_functions_demo +AS PERMANENT = 110e6; +``` + +:::note +You must have CREATE TABLE permissions on the Database where you want to use Database Analytics Functions. +::: + +Let's now create `accounts`, `customer` and `transactions` tables in our database `td_analytics_functions_demo` from the corresponding tables in `val` database. + +``` +DATABASE td_analytics_functions_demo; + +CREATE TABLE customer AS ( +SELECT * FROM val.customer +) WITH DATA; + +CREATE TABLE accounts AS ( +SELECT * FROM val.accounts +) WITH DATA; + +CREATE TABLE transactions AS ( +SELECT * FROM val.transactions +) WITH DATA; +``` + +## Understand the sample data + +Now, that we have our sample tables loaded into `td_analytics_functions_demo`, let's explore the data. It's a simplistic, fictitious dataset of banking customers (700-ish rows), Accounts (1400-ish rows) and Transactions (77K-ish rows). They are related to each other in the following ways: + +![Banking Model](../images/banking.model.png) + +In later parts of this how-to we are going to explore if we can build a model that predicts average monthly balance that a banking customer has on their credit card based on all non-credit card related variables in the tables. + +## Preparing the Dataset + +We have data in three different tables that we want to join and create features. Let's start by creating a joined table. 
+ +``` +-- Create a consolidated joined_table from customer, accounts and transactions table +CREATE TABLE td_analytics_functions_demo.joined_table AS ( + SELECT + T1.cust_id AS cust_id + ,MIN(T1.income) AS tot_income + ,MIN(T1.age) AS tot_age + ,MIN(T1.years_with_bank) AS tot_cust_years + ,MIN(T1.nbr_children) AS tot_children + ,MIN(T1.marital_status)AS marital_status + ,MIN(T1.gender) AS gender + ,MAX(T1.state_code) AS state_code + ,AVG(CASE WHEN T2.acct_type = 'CK' THEN T2.starting_balance+T2.ending_balance ELSE 0 END) AS ck_avg_bal + ,AVG(CASE WHEN T2.acct_type = 'SV' THEN T2.starting_balance+T2.ending_balance ELSE 0 END) AS sv_avg_bal + ,AVG(CASE WHEN T2.acct_type = 'CC' THEN T2.starting_balance+T2.ending_balance ELSE 0 END) AS cc_avg_bal + ,AVG(CASE WHEN T2.acct_type = 'CK' THEN T3.principal_amt+T3.interest_amt ELSE 0 END) AS ck_avg_tran_amt + ,AVG(CASE WHEN T2.acct_type = 'SV' THEN T3.principal_amt+T3.interest_amt ELSE 0 END) AS sv_avg_tran_amt + ,AVG(CASE WHEN T2.acct_type = 'CC' THEN T3.principal_amt+T3.interest_amt ELSE 0 END) AS cc_avg_tran_amt + ,COUNT(CASE WHEN ((EXTRACT(MONTH FROM T3.tran_date) + 2) / 3) = 1 THEN T3.tran_id ELSE NULL END) AS q1_trans_cnt + ,COUNT(CASE WHEN ((EXTRACT(MONTH FROM T3.tran_date) + 2) / 3) = 2 THEN T3.tran_id ELSE NULL END) AS q2_trans_cnt + ,COUNT(CASE WHEN ((EXTRACT(MONTH FROM T3.tran_date) + 2) / 3) = 3 THEN T3.tran_id ELSE NULL END) AS q3_trans_cnt + ,COUNT(CASE WHEN ((EXTRACT(MONTH FROM T3.tran_date) + 2) / 3) = 4 THEN T3.tran_id ELSE NULL END) AS q4_trans_cnt + FROM Customer AS T1 + LEFT OUTER JOIN Accounts AS T2 + ON T1.cust_id = T2.cust_id + LEFT OUTER JOIN Transactions AS T3 + ON T2.acct_nbr = T3.acct_nbr +GROUP BY T1.cust_id) WITH DATA UNIQUE PRIMARY INDEX (cust_id); +``` + +Let's now see how our data looks. The dataset has both categorical and continuous features or independent variables. In our case, the dependent variable is `cc_avg_bal` which is customer's average credit card balance. + +![Joined Table](../images/joined_table_ml.png) + +## Feature Engineering + +On looking at the data we see that there are several features that we can take into consideration for predicting the `cc_avg_bal`. + +### TD_OneHotEncodingFit + +As we have some categorical features in our dataset such as `gender`, `marital status` and `state code`. We will leverage the Database Analytics function [TD_OneHotEncodingFit](https://docs.teradata.com/r/Enterprise_IntelliFlex_VMware/Database-Analytic-Functions/Feature-Engineering-Transform-Functions/TD_OneHotEncodingFit) to encode categories to one-hot numeric vectors. + +``` +CREATE VIEW td_analytics_functions_demo.one_hot_encoding_joined_table_input AS ( + SELECT * FROM TD_OneHotEncodingFit( + ON td_analytics_functions_demo.joined_table AS InputTable + USING + IsInputDense ('true') + TargetColumn ('gender','marital_status','state_code') + CategoryCounts(2,4,33) +Approach('Auto') +) AS dt +); +``` + +### TD_ScaleFit + +If we look at the data, some columns like `tot_income`, `tot_age`, `ck_avg_bal` have values in different ranges. For the optimization algorithms like gradient descent it is important to normalize the values to the same scale for faster convergence, scale consistency and enhanced model performance. We will leverage [TD_ScaleFit](https://docs.teradata.com/r/Enterprise_IntelliFlex_VMware/Database-Analytic-Functions/Feature-Engineering-Transform-Functions/TD_ScaleFit) function to normalize values in different scales. 
+ +``` + CREATE VIEW td_analytics_functions_demo.scale_fit_joined_table_input AS ( + SELECT * FROM TD_ScaleFit( + ON td_analytics_functions_demo.joined_table AS InputTable + USING + TargetColumns('tot_income','q1_trans_cnt','q2_trans_cnt','q3_trans_cnt','q4_trans_cnt','ck_avg_bal','sv_avg_bal','ck_avg_tran_amt', 'sv_avg_tran_amt', 'cc_avg_tran_amt') + ScaleMethod('RANGE') +) AS dt +); +``` + +### TD_ColumnTransformer + +Teradata's Database Analytic Functions typically operate in pairs for data transformations. The first step is dedicated to "fitting" the data. Subsequently, the second function utilizes the parameters derived from the fitting process to execute the actual transformation on the data. The [TD_ColumnTransformer](https://docs.teradata.com/r/Enterprise_IntelliFlex_VMware/Database-Analytic-Functions/Feature-Engineering-Transform-Functions/TD_ColumnTransformer)takes the FIT tables to the function and transforms the input table columns in single operation. + +``` +-- Using a consolidated transform function +CREATE TABLE td_analytics_functions_demo.feature_enriched_accounts_consolidated AS ( +SELECT * FROM TD_ColumnTransformer( +ON joined_table AS InputTable +ON one_hot_encoding_joined_table_input AS OneHotEncodingFitTable DIMENSION +ON scale_fit_joined_table_input AS ScaleFitTable DIMENSION +) as dt +) WITH DATA; +``` + +Once we perform the transformation we can see our categorical columns one-hot encoded and numeric values scaled as can be seen in the image below. For ex: `tot_income` is in the range [0,1], `gender` is one-hot encoded to `gender_0`, `gender_1`, `gender_other`. + +![Total Income Scaled](../images/ml_tot_income_scaled.png) + +![Gender One Hot Encoded](../images/ml_gender_hot_encoded.png) + + +## Train Test Split + +As we have our datatset ready with features scaled and encoded, now let's split our dataset into training (75%) and testing (25%) parts. Teradata's Database Analytic Functions provide [TD_TrainTestSplit](https://docs.teradata.com/r/Enterprise_IntelliFlex_VMware/Database-Analytic-Functions/Model-Evaluation-Functions/TD_TrainTestSplit) function that we'll leverage to split our dataset. + +``` +-- Train Test Split on Input table +CREATE VIEW td_analytics_functions_demo.train_test_split AS ( +SELECT * FROM TD_TrainTestSplit( +ON td_analytics_functions_demo.feature_enriched_accounts_consolidated AS InputTable +USING +IDColumn('cust_id') +trainSize(0.75) +testSize(0.25) +Seed (42) +) AS dt +); +``` + +As can be seen in the image below, the function adds a new column `TD_IsTrainRow`. + +![Train Row Column](../images/ml_train_col.png) + +We'll use `TD_IsTrainRow` to create two tables, one for training and other for testing. + +``` +-- Creating Training Table +CREATE TABLE td_analytics_functions_demo.training_table AS ( + SELECT * FROM td_analytics_functions_demo.train_test_split + WHERE TD_IsTrainRow = 1 +) WITH DATA; + +-- Creating Testing Table +CREATE TABLE td_analytics_functions_demo.testing_table AS ( + SELECT * FROM td_analytics_functions_demo.train_test_split + WHERE TD_IsTrainRow = 0 +) WITH DATA; +``` + +## Training with Generalized Linear Model + +We will now use [TD_GLM](https://docs.teradata.com/r/Enterprise_IntelliFlex_VMware/Database-Analytic-Functions/Model-Training-Functions/TD_GLM) Database Analytic Function to train on our training dataset. The `TD_GLM` function is a generalized linear model (GLM) that performs regression and classification analysis on data sets. 
Here we use several input columns such as `tot_income`, `ck_avg_bal`, `cc_avg_tran_amt`, and the one-hot encoded values for marital status, gender, and state. `cc_avg_bal` is our dependent or response column; it is continuous, so this is a regression problem. We use `Gaussian` as the `Family` for regression and `Binomial` for classification.

The `Tolerance` parameter specifies the minimum improvement in prediction accuracy required for the model to stop iterating, and `MaxIterNum` specifies the maximum number of iterations allowed. Training concludes when either condition is met first. In the example below, the model is `CONVERGED` after 58 iterations.

```
-- Training the GLM_Model with Training Dataset
CREATE TABLE td_analytics_functions_demo.GLM_model_training AS (
SELECT * FROM TD_GLM (
    ON td_analytics_functions_demo.training_table AS InputTable
    USING
    InputColumns('tot_income','ck_avg_bal','cc_avg_tran_amt','[19:26]')
    ResponseColumn('cc_avg_bal')
    Family ('Gaussian')
    MaxIterNum (300)
    Tolerance (0.001)
    Intercept ('true')
) AS dt
) WITH DATA;
```

![Trained GLM](../images/ml_model_trained.png)

## Scoring on Testing Dataset

We will now use our model `GLM_model_training` to score our testing dataset `testing_table` using the [TD_GLMPredict](https://docs.teradata.com/r/Enterprise_IntelliFlex_VMware/Database-Analytic-Functions/Model-Scoring-Functions/TD_GLMPredict) Database Analytic Function.

```
-- Scoring the GLM_Model with Testing Dataset
CREATE TABLE td_analytics_functions_demo.GLM_model_test_prediction AS (
SELECT * from TD_GLMPredict (
ON td_analytics_functions_demo.testing_table AS InputTable
ON td_analytics_functions_demo.GLM_model_training AS ModelTable DIMENSION
USING
IDColumn ('cust_id')
Accumulate('cc_avg_bal')
) AS dt
) WITH DATA;
```

![Scored GLM](../images/ml_model_scored.png)

## Model Evaluation

Finally, we evaluate our model on the scored results. Here we are using the [TD_RegressionEvaluator](https://docs.teradata.com/r/Enterprise_IntelliFlex_VMware/Database-Analytic-Functions/Model-Evaluation-Functions/TD_RegressionEvaluator) function. The model can be evaluated based on parameters such as `R2`, `RMSE`, `F_score`.

```
-- Evaluating the model
SELECT * FROM TD_RegressionEvaluator(
ON td_analytics_functions_demo.GLM_model_test_prediction AS InputTable
USING
ObservationColumn('cc_avg_bal')
PredictionColumn('prediction')
Metrics('RMSE','MAE','R2')
) AS dt;
```

![Evaluated GLM](../images/ml_model_evaluated.png)

:::note
The purpose of this how-to is not to describe feature engineering but to demonstrate how we can leverage different Database Analytic Functions in Vantage. The model results might not be optimal and the process to make the best model is beyond the scope of this article.
:::

## Summary

In this quick start we have learned how to create ML models using Teradata Database Analytic Functions. We built our own database `td_analytics_functions_demo` with `customer`, `accounts`, and `transactions` data from the `val` database. We performed feature engineering by transforming the columns using `TD_OneHotEncodingFit`, `TD_ScaleFit` and `TD_ColumnTransformer`. We then used `TD_TrainTestSplit` for the train/test split. We trained a `TD_GLM` model on our training dataset and scored our testing dataset. Finally, we evaluated the scored results using the `TD_RegressionEvaluator` function.
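As an optional check, you can put a few individual predictions next to the actual balances. This small query assumes the scored table and column names created above (`cc_avg_bal` accumulated from the input, `prediction` produced by `TD_GLMPredict`):

```
SELECT TOP 5
    cust_id,
    cc_avg_bal,
    prediction
FROM td_analytics_functions_demo.GLM_model_test_prediction;
```

Large gaps between `cc_avg_bal` and `prediction` across many customers would be a hint to revisit the features or the GLM parameters.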
+ +## Further reading +* [Vantage Database Analytic Functions User Guide](https://docs.teradata.com/r/Enterprise_IntelliFlex_VMware/Database-Analytic-Functions/Introduction-to-Analytics-Database-Analytic-Functions) + + diff --git a/quickstarts/analyze-data/perform-time-series-analysis-using-teradata-vantage.md b/quickstarts/analyze-data/perform-time-series-analysis-using-teradata-vantage.md new file mode 100644 index 0000000000..b282c1163c --- /dev/null +++ b/quickstarts/analyze-data/perform-time-series-analysis-using-teradata-vantage.md @@ -0,0 +1,253 @@ +--- +sidebar_position: 5 +id: perform-time-series-analysis-using-teradata-vantage +author: Remi Turpaud +email: remi.turpaud@teradata.com +page_last_update: April 15th, 2022 +description: Perform time series analysis using Teradata Vantage +keywords: [data warehouses, analytics, teradata, vantage, time series, business intelligence, enterprise analytics, time series, time series analysis] +--- + +import ClearscapeDocsNote from '../_partials/vantage_clearscape_analytics.mdx' +import CommunityLink from '../_partials/community_link.mdx' + +# Perform time series analysis using Teradata Vantage + +## Overview + +Time series is series of data points indexed in time order. It is data continuously produced and collected by a wide range of applications and devices including but not limited to Internet of Things. Teradata Vantage offers various functionalities to simplify time series data analysis. + +## Prerequisites + +You need access to a Teradata Vantage instance. Times series functionalities and NOS are enabled in all Vantage editions from Vantage Express through Developer, DYI to Vantage as a Service starting from version 17.10. + + + +## Import data sets from AWS S3 using Vantage NOS + +Our sample data sets are available on S3 bucket and can be accessed from Vantage directly using Vantage NOS. Data is in CSV format and let's ingest them into Vantage for our time series analysis. + +Let's have a look at the data first. Below query will fetch 10 rows from S3 bucket. 
+ +``` +SELECT TOP 10 * FROM ( + LOCATION='/s3/nos-demo-apj.s3.amazonaws.com/taxi/2014/11/data_2014-11-25.csv' +) AS d; +``` + +Here is what we've got: + +``` + +Location vendor_id pickup_datetime dropoff_datetime passenger_count trip_distance pickup_longitude pickup_latitude rate_code store_and_fwd_flag dropoff_longitude dropoff_latitude payment_type fare_amount surcharge mta_tax tip_amount tolls_amount total_amount +------------------------------------------------------------------ --------- ----------------- ----------------- ---------------- -------------- ----------------- ---------------- ---------- ------------------- ------------------ ----------------- ------------- ------------ ---------- -------- ---------- ------------ ------------ +/S3/s3.amazonaws.com/nyc-tlc/csv_backup/yellow_tripdata_2013-11.csv CMT 25/11/2013 15:18 25/11/2013 15:33 1 1 -73.992423 40.749517 1 N -73.98816 40.746557 CRD 10 0 0.5 2.22 0 12.72 +/S3/s3.amazonaws.com/nyc-tlc/csv_backup/yellow_tripdata_2013-11.csv CMT 25/11/2013 5:34 25/11/2013 5:48 1 3.6 -73.971555 40.794548 1 N -73.975399 40.755404 CRD 14.5 0.5 0.5 1 0 16.5 +/S3/s3.amazonaws.com/nyc-tlc/csv_backup/yellow_tripdata_2013-11.csv CMT 25/11/2013 8:31 25/11/2013 8:55 1 5.9 -73.94764 40.830465 1 N -73.972323 40.76332 CRD 21 0 0.5 3 0 24.5 +/S3/s3.amazonaws.com/nyc-tlc/csv_backup/yellow_tripdata_2013-11.csv CMT 25/11/2013 7:00 25/11/2013 7:04 1 1.2 -73.983357 40.767193 1 N -73.978394 40.75558 CRD 5.5 0 0.5 1 0 7 +/S3/s3.amazonaws.com/nyc-tlc/csv_backup/yellow_tripdata_2013-11.csv CMT 25/11/2013 15:24 25/11/2013 15:30 1 0.5 -73.982313 40.764827 1 N -73.982129 40.758889 CRD 5.5 0 0.5 3 0 9 +/S3/s3.amazonaws.com/nyc-tlc/csv_backup/yellow_tripdata_2013-11.csv CMT 25/11/2013 15:53 25/11/2013 16:00 1 0.6 -73.978104 40.752966 1 N -73.985756 40.762685 CRD 6 1 0.5 1 0 8.5 +/S3/s3.amazonaws.com/nyc-tlc/csv_backup/yellow_tripdata_2013-11.csv CMT 25/11/2013 6:49 25/11/2013 7:04 1 3.8 -73.976005 40.744481 1 N -74.016063 40.717298 CRD 14 0 0.5 2.9 0 17.4 +/S3/s3.amazonaws.com/nyc-tlc/csv_backup/yellow_tripdata_2013-11.csv CMT 25/11/2013 21:20 25/11/2013 21:26 1 1.1 -73.946371 40.775369 1 N -73.95309 40.785103 CRD 6.5 0.5 0.5 1.5 0 9 +/S3/s3.amazonaws.com/nyc-tlc/csv_backup/yellow_tripdata_2013-11.csv CMT 25/11/2013 10:02 25/11/2013 10:17 1 2.2 -73.952625 40.780962 1 N -73.98163 40.777978 CRD 12 0 0.5 2 0 14.5 +/S3/s3.amazonaws.com/nyc-tlc/csv_backup/yellow_tripdata_2013-11.csv CMT 25/11/2013 9:43 25/11/2013 10:02 1 3.3 -73.982013 40.762507 1 N -74.006854 40.719582 CRD 15 0 0.5 2 0 17.5 + +``` + +Let's extract the complete data and bring it into Vantage for further analysis. + +``` + +CREATE TABLE trip +( + vendor_id varchar(10) character set latin NOT casespecific, + rate_code integer, + pickup_datetime timestamp(6), + dropoff_datetime timestamp(6), + passenger_count smallint, + trip_distance float, + pickup_longitude float, + pickup_latitude float, + dropoff_longitude float, + dropoff_latitude float +) +NO PRIMARY INDEX ; + + + +INSERT INTO trip +SELECT TOP 200000 vendor_id , + rate_code, + pickup_datetime, + dropoff_datetime , + passenger_count, + trip_distance , + pickup_longitude, + pickup_latitude , + dropoff_longitude , + dropoff_latitude FROM ( + LOCATION='/s3/nos-demo-apj.s3.amazonaws.com/taxi/2014/11/data_2014-11-25.csv' +) AS d; + +``` + +Result: + +``` +200000 rows affected. +``` + +Vantage will now fetch the data from S3 and insert into trip table we just created. 
+ +## Basic time series operations + +Now that we are familiar with the data set, we can use Vantage capabilities to quickly analyse the data set. First, let's identify how many passengers are being picked up by hour in the month of November. + +``` + +SELECT TOP 10 + $TD_TIMECODE_RANGE + ,begin($TD_TIMECODE_RANGE) time_bucket_start + ,sum(passenger_count) passenger_count +FROM trip +WHERE extract(month from pickup_datetime)=11 +GROUP BY TIME(HOURS(1)) +USING TIMECODE(pickup_datetime) +ORDER BY 1; + +``` + +For further reading on [GROUP BY TIME](https://www.docs.teradata.com/r/Teradata-VantageTM-Time-Series-Tables-and-Operations/July-2021/Time-Series-Aggregates-and-SELECT-Extensions/GROUP-BY-TIME-Clause). + +Result: + +``` +TIMECODE_RANGE time_bucket_start passenger_count +--------------------------------------------------------- --------------------------------- ---------------- +(2013-11-04 11:00:00.000000, 2013-11-04 12:00:00.000000) 2013-11-04 11:00:00.000000-05:00 4 +(2013-11-04 12:00:00.000000, 2013-11-04 13:00:00.000000) 2013-11-04 12:00:00.000000-05:00 2 +(2013-11-04 14:00:00.000000, 2013-11-04 15:00:00.000000) 2013-11-04 14:00:00.000000-05:00 5 +(2013-11-04 15:00:00.000000, 2013-11-04 16:00:00.000000) 2013-11-04 15:00:00.000000-05:00 2 +(2013-11-04 16:00:00.000000, 2013-11-04 17:00:00.000000) 2013-11-04 16:00:00.000000-05:00 9 +(2013-11-04 17:00:00.000000, 2013-11-04 18:00:00.000000) 2013-11-04 17:00:00.000000-05:00 11 +(2013-11-04 18:00:00.000000, 2013-11-04 19:00:00.000000) 2013-11-04 18:00:00.000000-05:00 41 +(2013-11-04 19:00:00.000000, 2013-11-04 20:00:00.000000) 2013-11-04 19:00:00.000000-05:00 2791 +(2013-11-04 20:00:00.000000, 2013-11-04 21:00:00.000000) 2013-11-04 20:00:00.000000-05:00 15185 +(2013-11-04 21:00:00.000000, 2013-11-04 22:00:00.000000) 2013-11-04 21:00:00.000000-05:00 27500 + + +``` + +Yes, this can also be achieved by extracting the hour from time and then aggregating - it's additional code/work, but can be done without timeseries specific functionality. + +But, now let's go a step further to identify how many passengers are being picked up and what is the average trip duration by vendor every 15 minutes in November. + +``` +SELECT TOP 10 + $TD_TIMECODE_RANGE, + vendor_id, + SUM(passenger_count), + AVG((dropoff_datetime - pickup_datetime ) MINUTE (4)) AS avg_trip_time_in_mins +FROM trip +GROUP BY TIME (MINUTES(15) AND vendor_id) +USING TIMECODE(pickup_datetime) +WHERE EXTRACT(MONTH FROM pickup_datetime)=11 +ORDER BY 1,2; +``` + +Result: + +``` + +TIMECODE_RANGE vendor_id passenger_count avg_trip_time_in_mins +-------------------------------------------------------- ---------- ---------------- ---------------------- +(2013-11-04 11:00:00.000000, 2013-11-04 11:15:00.000000) VTS 1 16 +(2013-11-04 11:15:00.000000, 2013-11-04 11:30:00.000000) VTS 1 10 +(2013-11-04 11:45:00.000000, 2013-11-04 12:00:00.000000) VTS 2 6 +(2013-11-04 12:00:00.000000, 2013-11-04 12:15:00.000000) VTS 1 11 +(2013-11-04 12:15:00.000000, 2013-11-04 12:30:00.000000) VTS 1 57 +(2013-11-04 14:15:00.000000, 2013-11-04 14:30:00.000000) VTS 1 3 +(2013-11-04 14:30:00.000000, 2013-11-04 14:45:00.000000) VTS 2 19 +(2013-11-04 14:45:00.000000, 2013-11-04 15:00:00.000000) VTS 2 9 +(2013-11-04 15:15:00.000000, 2013-11-04 15:30:00.000000) VTS 1 11 +(2013-11-04 15:30:00.000000, 2013-11-04 15:45:00.000000) VTS 1 31 + + +``` + +This is the power of Vantage time series functionality. 
Without needing complicated, cumbersome logic we are able to find average trip duration by vendor every 15 minutes just by modifying the group by time clause. Let's now look at how simple it is to build moving averages based on this. First, let's start by creating a view as below. + +``` +REPLACE VIEW NYC_taxi_trip_ts as +SELECT + $TD_TIMECODE_RANGE time_bucket_per + ,vendor_id + ,sum(passenger_count) passenger_cnt + ,avg(CAST((dropoff_datetime - pickup_datetime MINUTE(4) ) AS INTEGER)) avg_trip_time_in_mins +FROM trip +GROUP BY TIME (MINUTES(15) and vendor_id) +USING TIMECODE(pickup_datetime) +WHERE extract(month from pickup_datetime)=11 + +``` + +Let's calculate a 2 hours moving average on our 15-minutes time series. 2 hour is 8 * 15 minutes periods. + +``` +SELECT * FROM MovingAverage ( + ON NYC_taxi_trip_ts PARTITION BY vendor_id ORDER BY time_bucket_per + USING + MAvgType ('S') + WindowSize (8) + TargetColumns ('passenger_cnt') +) AS dt +WHERE begin(time_bucket_per)(date) = '2014-11-25' +ORDER BY vendor_id, time_bucket_per; +``` + + +Result: + +``` + +time_bucket_per vendor_id passenger_cnt avg_trip_time_in_mins passenger_cnt_smavg +--------------------------------------------------------- -------------- ---------------------- -------------------- -------------------- +(2013-11-04 14:45:00.000000, 2013-11-04 15:00:00.000000) VTS 2 9 1.375 +(2013-11-04 15:15:00.000000, 2013-11-04 15:30:00.000000) VTS 1 11 1.375 +(2013-11-04 15:30:00.000000, 2013-11-04 15:45:00.000000) VTS 1 31 1.375 +(2013-11-04 16:15:00.000000, 2013-11-04 16:30:00.000000) VTS 2 16 1.375 +(2013-11-04 16:30:00.000000, 2013-11-04 16:45:00.000000) VTS 1 3 1.375 +(2013-11-04 16:45:00.000000, 2013-11-04 17:00:00.000000) VTS 6 38 2 +(2013-11-04 17:15:00.000000, 2013-11-04 17:30:00.000000) VTS 2 29.5 2.125 +(2013-11-04 17:45:00.000000, 2013-11-04 18:00:00.000000) VTS 9 20.33333333 3 +(2013-11-04 18:00:00.000000, 2013-11-04 18:15:00.000000) VTS 6 23.4 3.5 +(2013-11-04 18:15:00.000000, 2013-11-04 18:30:00.000000) VTS 4 15.66666667 3.875 +(2013-11-04 18:30:00.000000, 2013-11-04 18:45:00.000000) VTS 8 24.5 4.75 +(2013-11-04 18:45:00.000000, 2013-11-04 19:00:00.000000) VTS 23 38.33333333 7.375 +(2013-11-04 19:00:00.000000, 2013-11-04 19:15:00.000000) VTS 195 26.61538462 31.625 +(2013-11-04 19:15:00.000000, 2013-11-04 19:30:00.000000) VTS 774 13.70083102 127.625 +(2013-11-04 19:30:00.000000, 2013-11-04 19:45:00.000000) VTS 586 12.38095238 200.625 +(2013-11-04 19:45:00.000000, 2013-11-04 20:00:00.000000) VTS 1236 15.54742097 354 +(2013-11-04 20:00:00.000000, 2013-11-04 20:15:00.000000) VTS 3339 11.78947368 770.625 +(2013-11-04 20:15:00.000000, 2013-11-04 20:30:00.000000) VTS 3474 10.5603396 1204.375 +(2013-11-04 20:30:00.000000, 2013-11-04 20:45:00.000000) VTS 3260 12.26484323 1610.875 +(2013-11-04 20:45:00.000000, 2013-11-04 21:00:00.000000) VTS 5112 12.05590062 2247 + + + +``` + +:::note +In addition to above time series operations, Vantage also provides a special time series tables with Primary Time Index (PTI). These are regular Vantage tables with PTI defined rather than a Primary Index (PI). Though tables with PTI are not mandatory for time series functionality/operations, PTI optimizes how the time series data is stored physically and hence improves performance considerably compared to regular tables. +::: + +## Summary + +In this quick start we have learnt how easy it is to analyse time series datasets using Vantage's time series capabilities. 
+ +## Further reading +* [Teradata Vantage™ - Time Series Tables and Operations](https://docs.teradata.com/r/Teradata-VantageTM-Time-Series-Tables-and-Operations/July-2021/Introduction-to-Teradata-Time-Series-Tables-and-Operations) +* [NOS](https://quickstarts.teradata.com/nos.html) +* [Teradata Vantage™ - Native Object Store Getting Started Guide](https://docs.teradata.com/r/2mw8ooFr~xX0EaaGFaDW8A/root) + + diff --git a/quickstarts/analyze-data/sagemaker-with-teradata-vantage.md b/quickstarts/analyze-data/sagemaker-with-teradata-vantage.md new file mode 100644 index 0000000000..0ffdf6b00f --- /dev/null +++ b/quickstarts/analyze-data/sagemaker-with-teradata-vantage.md @@ -0,0 +1,177 @@ +--- +sidebar_position: 9 +author: Wenjie Tehan +email: wenjie.tehan@teradata.com +page_last_update: February 8th, 2022 +description: Use AWS SageMaker with Teradata Vantage. +keywords: [data warehouses, compute storage separation, teradata, vantage, cloud data platform, object storage, business intelligence, enterprise analytics, ai, artificial intelligence, aws sagemaker.] +--- + +import CommunityLinkPartial from '../_partials/community_link.mdx'; + +# Use AWS SageMaker with Teradata Vantage + + +### Overview + +This how-to will help you to integrate Amazon SageMaker with Teradata Vantage. The approach this guide explains is one of many potential approaches to integrate with the service. + +Amazon SageMaker provides a fully managed Machine Learning Platform. There are two use cases for Amazon SageMaker and Teradata: + +1. Data resides on Teradata Vantage and Amazon SageMaker will be used for both the Model definition and subsequent scoring. Under this use case Teradata will provide data into the Amazon S3 environment so that Amazon SageMaker can consume training and test data sets for the purpose of model development. Teradata would further make data available via Amazon S3 for subsequent scoring by Amazon SageMaker. Under this model Teradata is a data repository only. + +2. Data resides on Teradata Vantage and Amazon SageMaker will be used for the Model definition, and Teradata for the subsequent scoring. Under this use case Teradata will provide data into the Amazon S3 environment so that Amazon SageMaker can consume training and test data sets for the purpose of model development. Teradata will need to import the Amazon SageMaker model into a Teradata table for subsequent scoring by Teradata Vantage. Under this model Teradata is a data repository and a scoring engine. + +The first use case is discussed in this document. + +Amazon SageMaker consumes training and test data from an Amazon S3 bucket. This article describes how you can load Teradata analytics data sets into an Amazon S3 bucket. The data can then available to Amazon SageMaker to build and train machine learning models and deploy them into a production environment. + + +### Prerequisites + +import ClearscapeDocsNote from '../_partials/vantage_clearscape_analytics.mdx' + +* Access to a Teradata Vantage instance. + +* IAM permission to access Amazon S3 bucket, and to use Amazon SageMaker service. +* An Amazon S3 bucket to store training data. + +### Load data + +Amazon SageMaker trains data from an Amazon S3 bucket. Following are the steps to load training data from Vantage to an Amazon S3 bucket: + +1. Go to Amazon SageMaker console and create a notebook instance. 
See the [Amazon SageMaker Developer Guide](https://docs.aws.amazon.com/sagemaker/latest/dg/gs-setup-working-env.html) for instructions on how to create a notebook instance:
![Create notebook instance](../cloud-guides/images/sagemaker-with-teradata-vantage/create.notebook.png)

2. Open your notebook instance:
![Open notebook instance](../cloud-guides/images/sagemaker-with-teradata-vantage/open.notebook.instance.png)

3. Start a new file by clicking on `New -> conda_python3`:
![Start new file](../cloud-guides/images/sagemaker-with-teradata-vantage/start.new.file.png)

4. Install the Teradata Python library:

``` python
!pip install teradataml
```

5. In a new cell, import additional libraries:

``` python
import teradataml as tdml
from teradataml import create_context, get_context, remove_context
from teradataml.dataframe.dataframe import DataFrame
import pandas as pd
import boto3, os
```

6. In a new cell, connect to Teradata Vantage. Replace `<host>`, `<username>`, and `<password>` to match your Vantage environment:

``` python
create_context(host = '<host>', username = '<username>', password = '<password>')
```

7. Retrieve data from the table where the training dataset resides using the teradataml DataFrame API:

``` python
train_data = tdml.DataFrame('table_with_training_data')
trainDF = train_data.to_pandas()
```

8. Write the data to a local file:

``` python
trainFileName = 'train.csv'
trainDF.to_csv(trainFileName, header=None, index=False)
```

9. Upload the file to Amazon S3:

``` python
bucket = 'sagedemo'
prefix = 'sagemaker/train'

trainFile = open(trainFileName, 'rb')
boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, trainFileName)).upload_fileobj(trainFile)
```

### Train the model

1. Select `Training jobs` on the left menu under `Training`, then click on `Create training job`:
![Create training job](../cloud-guides/images/sagemaker-with-teradata-vantage/create.training.job.png)

2. At the `Create training job` window, fill in the `Job name` (e.g. `xgboost-bank`) and choose `Create a new role` for the IAM role. Choose `Any S3 bucket` for the Amazon S3 buckets and click `Create role`:
![Create IAM role](../cloud-guides/images/sagemaker-with-teradata-vantage/create.iam.role.png)

3. Back in the `Create training job` window, use `XGBoost` as the algorithm:
![Choose an algorithm](../cloud-guides/images/sagemaker-with-teradata-vantage/choose.an.algorithm.png)

4. Use the default `ml.m4.xlarge` instance type, and 30GB of additional storage volume per instance. This is a short training job; it shouldn't take more than 10 minutes.
![Resource configuration](../cloud-guides/images/sagemaker-with-teradata-vantage/resource.configuration.png)

5. Fill in the following hyperparameters and leave everything else as default:

```
num_round=100
silent=0
eta=0.2
gamma=4
max_depth=5
min_child_weight=6
subsample=0.8
objective='binary:logistic'
```

6. For `Input data configuration`, enter the Amazon S3 bucket where you stored your training data. Input mode is `File`. Content type is `csv`. `S3 location` is where the file was uploaded to:
![Input data configuration](../cloud-guides/images/sagemaker-with-teradata-vantage/input.data.configuration.png)

7. For `Output data configuration`, enter the path where the output data will be stored:
![Output data configuration](../cloud-guides/images/sagemaker-with-teradata-vantage/output.data.configuration.png)

8.
Leave everything else as default, and click on `Create training job`. Detailed instructions on how to configure the training job can be found in the [Amazon SageMaker Developer Guide](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-mkt-algo-train.html#sagemaker-mkt-algo-train-console).

Once the training job is created, Amazon SageMaker launches the ML instances to train the model and stores the resulting model artifacts and other output in the `Output data configuration` location (`path//output` by default).

### Deploy the model

After you train your model, deploy it using a persistent endpoint.

### Create a model

1. Select `Models` under `Inference` from the left panel, then `Create model`. Fill in the model name (e.g. `xgboost-bank`), and choose the IAM role you created in the previous step.
2. For `Container definition 1`, use `433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:latest` as `Location of inference code image`. `Location of model artifacts` is the output path of your training job.
![Container definition 1](../cloud-guides/images/sagemaker-with-teradata-vantage/container.definition.1.png)
3. Leave everything else as default, then `Create model`.

### Create an endpoint configuration

1. Select the model you just created, then click on `Create endpoint configuration`:
![Create endpoint configuration](../cloud-guides/images/sagemaker-with-teradata-vantage/create.endpoint.configuration.png)

2. Fill in the name (e.g. `xgboost-bank`) and use the default for everything else. The model name and training job should be automatically populated for you. Click on `Create endpoint configuration`.

### Create endpoint

1. Select `Inference` -> `Models` from the left panel, select the model again, and click on `Create endpoint` this time:
![Create endpoint](../cloud-guides/images/sagemaker-with-teradata-vantage/create.endpoint.png)

2. Fill in the name (e.g. `xgboost-bank`), and select `Use an existing endpoint configuration`:
![Attach endpoint configuration](../cloud-guides/images/sagemaker-with-teradata-vantage/attach.endpoint.configuration.png)

3. Select the endpoint configuration created in the last step, and click on `Select endpoint configuration`:
![Select endpoint configuration](../cloud-guides/images/sagemaker-with-teradata-vantage/select.endpoint.configuration.png)

4. Leave everything else as default and click on `Create endpoint`.

Now the model is deployed to the endpoint and can be used by client applications.

### Summary

This how-to demonstrated how to extract training data from Vantage and use it to train a model in Amazon SageMaker. The solution used a Jupyter notebook to extract data from Vantage and write it to an S3 bucket. A SageMaker training job read data from the S3 bucket and produced a model. The model was deployed to AWS as a service endpoint.
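If you want to smoke-test the endpoint from Python, the sketch below shows one way to do it with `boto3`. It assumes the endpoint name `xgboost-bank` used above and a hypothetical comma-separated feature row shaped like the training CSV; adjust both to your setup.

``` python
import boto3

# SageMaker runtime client; assumes AWS credentials and region are configured
runtime = boto3.client('sagemaker-runtime')

# Hypothetical single observation, in the same column order as the training CSV
payload = '35,2,1,0,1,0,0,1'

response = runtime.invoke_endpoint(
    EndpointName='xgboost-bank',  # endpoint created in the steps above
    ContentType='text/csv',
    Body=payload
)

# The response body contains the prediction score as text
print(response['Body'].read().decode('utf-8'))
```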
+ +### Further reading +* [API integration guide for AWS SageMaker](https://docs.teradata.com/r/Enterprise_IntelliFlex_VMware/Teradata-VantageTM-API-Integration-Guide-for-Cloud-Machine-Learning/Amazon-Web-Services) +* [Integrate Teradata Jupyter extensions with SageMaker notebook instance](https://quickstarts.teradata.com/cloud-guides/integrate-teradata-jupyter-extensions-with-sagemaker.html) + + + + diff --git a/quickstarts/analyze-data/sto.md b/quickstarts/analyze-data/sto.md new file mode 100644 index 0000000000..1a118a5f93 --- /dev/null +++ b/quickstarts/analyze-data/sto.md @@ -0,0 +1,250 @@ +--- +sidebar_position: 4 +id: sto +title: Run scripts on Vantage +author: Adam Tworkiewicz +email: adam.tworkiewicz@teradata.com +page_last_update: September 7th, 2021 +description: Run Applications on Teradata - use Script Table Operator to run applications on your data without data movement. +keywords: [data warehouses, compute storage separation, teradata, vantage, script table operator, cloud data platform, object storage, business intelligence, enterprise analytics] +--- + +import ClearscapeDocsNote from '../_partials/vantage_clearscape_analytics.mdx' +import CommunityLink from '../_partials/community_link.mdx' + +# Run scripts on Vantage + +## Overview + +Sometimes, you need to apply complex logic to your data that can't be easily expressed in SQL. One option is to wrap your logic in a User Defined Function (UDF). What if you already have this logic coded in a language that is not supported by UDF? Script Table Operator is a Vantage feature that allows you to bring your logic to the data and run it on Vantage. The advantage of this approach is that you don't have to retrieve data from Vantage to operate on it. Also, by running your data applications on Vantage, you leverage its parallel nature. You don't have to think how your applications will scale. You can let Vantage take care of it. + +## Prerequisites + +You need access to a Teradata Vantage instance. + + + +## Hello World + +Let's start with something simple. What if you wanted the database to print "Hello World"? + +``` +SELECT * +FROM + SCRIPT( + SCRIPT_COMMAND('echo Hello World!') + RETURNS ('Message varchar(512)')); +``` + +Here is what I've got: +``` +Message +------------ +Hello World! +Hello World! +``` + +Let's analyze what just happened here. The SQL includes `echo Hello World!`. This is a Bash command. OK, so now we know how to run Bash commands. But why did we get 2 rows and not one? That's because our simple script was run once on each AMP and I happen to have 2 AMPs: + +``` +-- Teradata magic that returns the number of AMPs in a system +SELECT hashamp()+1 AS number_of_amps; +``` + +Returns: +``` +number_of_amps +-------------- + 2 +``` + +This simple script demonstrates the idea behind the Script Table Operator (STO). You provide your script and the database runs it in parallel, once for each AMP. This is an attractive model in case you have transformation logic in a script and a lot of data to process. Normally, you would need to build concurrency into your application. By letting STO do it, you let Teradata select the right concurrency level for your data. + +## Supported languages + +OK, so we did `echo` in Bash but Bash is hardly a productive environment to express complex logic. What other languages are supported then? The good news is that any binary that can run on Vantage nodes can be used in STO. Remember, that the binary and all its dependencies need to be installed on all your Vantage nodes. 
In practice, it means that your options will be limited to what your administrator is willing and able to maintain on your servers. Python is a very popular choice. + +## Uploading scripts + +Ok, Hello World is super exciting, but what if we have existing logic in a large file. Surely, you don't want to paste your entire script and escape quotes in an SQL query. We solve the script upload issue with the User Installed Files (UIF) feature. + +Say you have `helloworld.py` script with the following content: + +``` +print("Hello World!") +``` + +Let's assume the script is on your local machine at `/tmp/helloworld.py`. + +First, we need to setup permissions in Vantage. We are going to do this using a new database to keep it clean. + +``` +-- Create a new database called sto +CREATE DATABASE STO +AS PERMANENT = 60e6, -- 60MB + SPOOL = 120e6; -- 120MB + +-- Allow dbc user to create scripts in database STO +GRANT CREATE EXTERNAL PROCEDURE ON STO to dbc; +``` + +You can upload the script to Vantage using the following procedure call: + +``` +call SYSUIF.install_file('helloworld', + 'helloworld.py', 'cz!/tmp/helloworld.py'); +``` + +Now that the script has been uploaded, you can call it like this: + +``` +-- We switch to STO database +DATABASE STO + +-- We tell Vantage where to look for the script. This can be +-- any string and it will create a symbolic link to the directory +-- where our script got uploaded. By convention, we use the +-- database name. +SET SESSION SEARCHUIFDBPATH = sto; + +-- We now call the script. Note, how we use a relative path that +-- starts with `./sto/`, which is where SEARCHUIFDBPATH +-- is pointing. +SELECT * +FROM SCRIPT( + SCRIPT_COMMAND('python3 ./sto/helloworld.py') + RETURNS ('Message varchar(512)')); +``` + +The last call should return: +``` +Message +------------ +Hello World! +Hello World! +``` + +That was a lot of work and we are still at Hello World. Let's try to pass some data into `SCRIPT`. + +## Passing data stored in Vantage to SCRIPT + +So far, we have been using `SCRIPT` operator to run standalone scripts. But the main purpose to run scripts on Vantage is to process data that is in Vantage. Let's see how we can retrieve data from Vantage and pass it to `SCRIPT`. + +We will start with creating a table with a few rows. + +``` +-- Switch to STO database. +DATABASE STO + +-- Create a table with a few urls +CREATE TABLE urls(url varchar(10000)); +INS urls('https://www.google.com/finance?q=NYSE:TDC'); +INS urls('http://www.ebay.com/sch/i.html?_trksid=p2050601.m570.l1313.TR0.TRC0.H0.Xteradata+merchandise&_nkw=teradata+merchandise&_sacat=0&_from=R40'); +INS urls('https://www.youtube.com/results?search_query=teradata%20commercial&sm=3'); +INS urls('https://www.contrivedexample.com/example?mylist=1&mylist=2&mylist=...testing'); +``` + +We will use the following script to parse out query parameters: + +``` +from urllib.parse import urlparse +from urllib.parse import parse_qsl +import sys + +for line in sys.stdin: + # remove leading and trailing whitespace + url = line.strip() + parsed_url = urlparse(url) + query_params = parse_qsl(parsed_url.query) + + for element in query_params: + print("\t".join(element)) +``` + +Note, how the scripts assumes that urls will be fed into `stdin` one by one, line by line. Also, note how it prints results line by line, using the tab character as a delimiter between values. + +Let's install the script. 
Here, we assume that the script file is at `/tmp/urlparser.py` on our local machine: +``` +CALL SYSUIF.install_file('urlparser', + 'urlparser.py', 'cz!/tmp/urlparser.py'); +``` + +With the script installed, we will now retrieve data from `urls` table and feed it into the script to retrieve query parameters: +``` +-- We inform Vantage to create a symbolic link from the UIF directory to ./sto/ +SET SESSION SEARCHUIFDBPATH = sto ; + +SELECT * + FROM SCRIPT( + ON(SELECT url FROM urls) + SCRIPT_COMMAND('python3 ./sto/urlparser.py') + RETURNS ('param_key varchar(512)', 'param_value varchar(512)')); +``` + +As a result, we get query params and their values. There are as many rows as key/value pairs. Also, since we inserted a tab between the key and the value output in the script, we get 2 columns from STO. +``` +param_key |param_value +------------+----------------------------------------------------- +q |NYSE:TDC +_trksid |p2050601.m570.l1313.TR0.TRC0.H0.Xteradata merchandise +search_query|teradata commercial +_nkw |teradata merchandise +sm |3 +_sacat |0 +mylist |1 +_from |R40 +mylist |2 +mylist |...testing +``` + +## Inserting SCRIPT output into a table + +We have learned how to take data from Vantage, pass it to a script and get output. Is there an easy way to store this output in a table? Sure, there is. We can combine the select above with `CREATE TABLE` statement: + +``` +-- We inform Vantage to create a symbolic link from the UIF directory to ./sto/ +SET SESSION SEARCHUIFDBPATH = sto ; + +CREATE MULTISET TABLE + url_params(param_key, param_value) +AS ( + SELECT * + FROM SCRIPT( + ON(SELECT url FROM urls) + SCRIPT_COMMAND('python3 ./sto/urlparser.py') + RETURNS ('param_key varchar(512)', 'param_value varchar(512)')) +) WITH DATA +NO PRIMARY INDEX; +``` + +Now, let's inspect the contents of `url_params` table: + +``` +SELECT * FROM url_params; +``` + +You should see the following output: +``` +param_key |param_value +------------+----------------------------------------------------- +q |NYSE:TDC +_trksid |p2050601.m570.l1313.TR0.TRC0.H0.Xteradata merchandise +search_query|teradata commercial +_nkw |teradata merchandise +sm |3 +_sacat |0 +mylist |1 +_from |R40 +mylist |2 +mylist |...testing +``` + +## Summary + +In this quick start we have learned how to run scripts against data in Vantage. We ran scripts using Script Table Operator (STO). The operator allows us to bring logic to the data. It offloads concurrency considerations to the database by running our scripts in parallel, one per AMP. All you need to do is provide a script and the database will execute it in parallel. + +## Further reading +* [Teradata Vantage™ - SQL Operators and User-Defined Functions - SCRIPT](https://docs.teradata.com/r/9VmItX3V2Ni9Ts70HbDzVg/CBAaRxUyOdF0t1SQIuXeug) +* [R and Python Analytics with SCRIPT Table Operator](https://docs.teradata.com/v/u/Orange-Book/R-and-Python-Analytics-with-SCRIPT-Table-Operator-Orange-Book-4.3.1) + + \ No newline at end of file diff --git a/quickstarts/analyze-data/use-teradata-vantage-with-azure-machine-learning-studio.md b/quickstarts/analyze-data/use-teradata-vantage-with-azure-machine-learning-studio.md new file mode 100644 index 0000000000..d4ed95c455 --- /dev/null +++ b/quickstarts/analyze-data/use-teradata-vantage-with-azure-machine-learning-studio.md @@ -0,0 +1,209 @@ +--- +sidebar_position: 10 +author: Rupal Shah +email: rupal.shah@teradata.com +page_last_update: February 14th, 2022 +description: Use Teradata Vantage with Azure Machine Learning Studio. 
keywords: [data warehouses, compute storage separation, teradata, vantage, cloud data platform, data cloud, machine learning, azure, azure machine learning studio]
---

# Use Teradata Vantage with Azure Machine Learning Studio

### Overview

[Azure Machine Learning (ML) Studio](https://docs.microsoft.com/en-us/azure/machine-learning/studio/what-is-ml-studio) is a collaborative, drag-and-drop tool you can use to build, test, and deploy predictive analytics solutions on your data. ML Studio can consume data from Azure Blob Storage. This getting started guide will show how you can copy Teradata Vantage data sets to Blob Storage using ML Studio's built-in Jupyter Notebook feature. The data can then be used by ML Studio to build and train machine learning models and deploy them into a production environment.


![](../cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image2.png)


### Prerequisites

import ClearscapeDocsNote from '../_partials/vantage_clearscape_analytics.mdx'

* Access to a Teradata Vantage instance.

* Azure subscription or create a [free account](https://azure.microsoft.com/free)
* [Azure ML Studio workspace](https://docs.microsoft.com/en-us/azure/machine-learning/studio/create-workspace)
* (Optional) Download [AdventureWorks DW 2016 database](https://docs.microsoft.com/en-us/sql/samples/adventureworks-install-configure?view=sql-server-2017) (i.e. _'Training the Model'_ section)
  * Restore and copy the _'vTargetMail'_ table from SQL Server to Teradata Vantage


### Procedure

### Initial setup


* During ML Studio workspace creation, you may need to create a new storage account unless you have one in the current availability locations. For this getting started guide, choose *DEVTEST Standard* for the *Web service plan*. Log on to the [Azure portal](https://portal.azure.com), open your storage account, and create a *container* if one does not exist already.
![](../cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image3.png)

* Copy your *storage account name* and *key* to a notepad; we will use them in the Python3 notebook to access your Azure Blob Storage account.
![](../cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image4.png)

* Finally, open the *Configuration* property and set *_'Secure transfer required'_* to *_Disabled_* to allow the ML Studio Import Data module to access the blob storage account.
![](../cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image5.png)

### Load data

To get the data into ML Studio, we first need to load it from Teradata Vantage into Azure Blob Storage. We will create an ML Studio Jupyter notebook, install the Python packages needed to connect to Teradata, and save the data to Azure Blob Storage.

Log on to the [Azure portal](https://portal.azure.com), go to your *ML Studio workspace*, [Launch Machine Learning Studio](https://studio.azureml.net), and *Sign In*.

1. You should see the following screen. Click on *Notebooks*, ensure you are in the right region/workspace, and click on Notebook *New*:
![](../cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image6.png)

2. Choose *Python3* and *name* your notebook instance:
![](../cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image7.png)

3.
+
+``` python
+!pip install teradataml
+```
+
+:::note
+The Teradata Vantage Python package has not been formally validated with Microsoft Azure ML Studio.
+:::
+
+4. Install the [Microsoft Azure Storage Blob Client Library for Python](https://pypi.org/project/azure-storage-blob):
+
+``` python
+!pip install azure-storage-blob
+```
+
+5. Import the following libraries:
+
+``` python
+import teradataml as tdml
+from teradataml import create_context, get_context, remove_context
+from teradataml.dataframe.dataframe import DataFrame
+import pandas as pd
+from azure.storage.blob import (BlockBlobService)
+```
+
+6. Connect to Teradata using the `create_context` command:
+
+``` python
+create_context(host = '', username = '', password = '')  # supply your Vantage host, username, and password
+```
+
+7. Retrieve data using the Teradata Python DataFrame module:
+
+``` python
+train_data = DataFrame.from_table("")  # supply the name of the source table
+```
+
+8. Convert the Teradata DataFrame to a pandas DataFrame:
+
+``` python
+trainDF = train_data.to_pandas()
+```
+
+9. Convert the data to CSV:
+
+``` python
+trainDF = trainDF.to_csv(header=True, index=False)
+```
+
+10. Assign variables for the Azure Blob Storage account name, key, and container name:
+
+``` python
+accountName=""
+accountKey=""
+containerName="mldata"
+```
+
+11. Upload the file to Azure Blob Storage:
+
+``` python
+blobService = BlockBlobService(account_name=accountName, account_key=accountKey)
+blobService.create_blob_from_text(containerName, 'vTargetMail.csv', trainDF)
+```
+
+12. Log on to the Azure portal and open the blob storage account to view the uploaded file:
+![](../cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image17.png)
+
+### Train the model
+
+We will use the existing [Analyze data with Azure Machine Learning](https://docs.microsoft.com/en-us/azure/sql-data-warehouse/sql-data-warehouse-get-started-analyze-with-azure-machine-learning) article to build a predictive machine learning model based on the data in Azure Blob Storage. We will build a targeted marketing campaign for Adventure Works, the bike shop, by predicting whether a customer is likely to buy a bike or not.
+
+#### Import data
+
+The data is in the Azure Blob Storage file called `vTargetMail.csv`, which we copied in the section above.
+
+1. Sign in to [Azure Machine Learning studio](https://studio.azureml.net) and click on **Experiments**.
+2. Click **+NEW** on the bottom left of the screen and select **Blank Experiment**.
+3. Enter a name for your experiment: Targeted Marketing.
+4. Drag the **Import Data** module under **Data Input and Output** from the modules pane into the canvas.
+5. Specify the details of your Azure Blob Storage (account name, key, and container name) in the Properties pane.
+
+Run the experiment by clicking *Run* under the experiment canvas.
+
+![](../cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image18.png)
+
+After the experiment finishes running successfully, click the output port at the bottom of the Import Data module and select *Visualize* to see the imported data.
+
+![](../cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image19.png)
+
+
+#### Clean the data
+
+To clean the data, drop some columns that are not relevant for the model. To do this:
+
+1. Drag the *Select Columns in Dataset* module under *Data Transformation > Manipulation* into the canvas. Connect this module to the *Import Data* module.
+2. Click *Launch column selector* in the Properties pane to specify which columns you wish to drop.
+![](../cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image20.png)
+3. Exclude two columns: *CustomerAlternateKey* and *GeographyKey*.
+![](../cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image21.png)
+
+#### Build the model
+
+We will split the data 80-20: 80% to train a machine learning model and 20% to test the model. We will make use of the "Two-Class" algorithms for this binary classification problem.
+
+1. Drag the **Split Data** module into the canvas and connect it to *Select Columns in Dataset*.
+2. In the Properties pane, enter 0.8 for *Fraction of rows in the first output dataset*.
+![](../cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image22.png)
+
+3. Search for and drag the **Two-Class Boosted Decision Tree** module into the canvas.
+4. Search for and drag the **Train Model** module into the canvas and specify its inputs by connecting it to the **Two-Class Boosted Decision Tree** (ML algorithm) and **Split Data** (data to train the algorithm on) modules.
+![](../cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image23.png)
+
+5. Then, click *Launch column selector* in the Properties pane. Select the *BikeBuyer* column as the column to predict.
+![](../cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image24.png)
+
+#### Score the model
+
+Now, we will test how the model performs on test data. We will compare the algorithm of our choice with a different algorithm to see which performs better.
+
+1. Drag the **Score Model** module into the canvas and connect it to the **Train Model** and **Split Data** modules.
+![](../cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image25.png)
+
+2. Search for and drag the **Two-Class Bayes Point Machine** into the experiment canvas. We will compare how this algorithm performs against the Two-Class Boosted Decision Tree.
+3. Copy and paste the **Train Model** and **Score Model** modules in the canvas.
+4. Search for and drag the **Evaluate Model** module into the canvas to compare the two algorithms.
+5. **Run** the experiment.
+![](../cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image26.png)
+6. Click the output port at the bottom of the Evaluate Model module and click Visualize.
+![](../cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image27.png)
+
+
+The metrics provided are the ROC curve, precision-recall diagram, and lift curve. Looking at these metrics, we can see that the first model performed better than the second one. To look at what the first model predicted, click the output port of the Score Model module and click Visualize.
+
+![](../cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image28.png)
+
+You will see two more columns added to your test dataset.
+1. Scored Probabilities: the likelihood that a customer is a bike buyer.
+2. Scored Labels: the classification done by the model - bike buyer (1) or not (0). The probability threshold for labeling is set to 50% and can be adjusted.
+
+Comparing the column BikeBuyer (actual) with the Scored Labels (prediction), you can see how well the model has performed. As a next step, you can use this model to make predictions for new customers and publish the model as a web service or write the results back to SQL Data Warehouse.
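+
+For reference, the notebook cells from the *Load data* section above can be combined into a single cell. The following is a minimal sketch: the values in angle brackets are placeholders for your Vantage host, credentials, source table, and storage account details, and the container name `mldata` matches the value used earlier.
+
+``` python
+# Consolidated sketch of the "Load data" steps (placeholders in angle brackets).
+from teradataml import create_context, remove_context
+from teradataml.dataframe.dataframe import DataFrame
+from azure.storage.blob import BlockBlobService
+
+# Connect to Teradata Vantage.
+create_context(host="<vantage-host>", username="<username>", password="<password>")
+
+# Pull the source table into a teradataml DataFrame, then into pandas.
+train_data = DataFrame.from_table("<source-table>")
+trainDF = train_data.to_pandas()
+
+# Serialize to CSV text with a header row and no index column.
+csv_text = trainDF.to_csv(header=True, index=False)
+
+# Upload the CSV to Azure Blob Storage.
+blobService = BlockBlobService(account_name="<storage-account>", account_key="<storage-key>")
+blobService.create_blob_from_text("mldata", "vTargetMail.csv", csv_text)
+
+# Release the Vantage connection.
+remove_context()
+```
+
+Note that `BlockBlobService` comes from the legacy azure-storage-blob package; if the import fails on a newer (12.x or later) version of the package, install an older release (for example, `!pip install "azure-storage-blob<12"`) or adapt the upload step to the `BlobServiceClient` API.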
+ +### Further reading + +* To learn more about building predictive machine learning models, refer to [+++Introduction to Machine Learning on Azure+++](https://azure.microsoft.com/documentation/articles/machine-learning-what-is-machine-learning). +* For large data set copies, consider using the [Teradata Access Module for Azure](https://docs.teradata.com/reader/p~0sSD4zl4K8YPbEGnM3Rg/TTu_WJMMIpo2TEaxFMFopQ) that interfaces between the Teradata Parallel Transporter load/unload operators and Azure Blob Storage. + +import CommunityLinkPartial from '../_partials/community_link.mdx'; + + diff --git a/quickstarts/business-intelligence/images/connect-power-bi/power.bi.database.picker.png b/quickstarts/business-intelligence/images/connect-power-bi/power.bi.database.picker.png new file mode 100644 index 0000000000..c4e958b608 Binary files /dev/null and b/quickstarts/business-intelligence/images/connect-power-bi/power.bi.database.picker.png differ diff --git a/quickstarts/business-intelligence/images/connect-power-bi/power.bi.elements.png b/quickstarts/business-intelligence/images/connect-power-bi/power.bi.elements.png new file mode 100644 index 0000000000..dabf473c18 Binary files /dev/null and b/quickstarts/business-intelligence/images/connect-power-bi/power.bi.elements.png differ diff --git a/quickstarts/business-intelligence/images/connect-power-bi/power.bi.get.data.menu.png b/quickstarts/business-intelligence/images/connect-power-bi/power.bi.get.data.menu.png new file mode 100644 index 0000000000..011eb9633b Binary files /dev/null and b/quickstarts/business-intelligence/images/connect-power-bi/power.bi.get.data.menu.png differ diff --git a/quickstarts/business-intelligence/images/connect-power-bi/power.bi.icon.png b/quickstarts/business-intelligence/images/connect-power-bi/power.bi.icon.png new file mode 100644 index 0000000000..347f21a34b Binary files /dev/null and b/quickstarts/business-intelligence/images/connect-power-bi/power.bi.icon.png differ diff --git a/quickstarts/business-intelligence/images/connect-power-bi/power.bi.ldap.png b/quickstarts/business-intelligence/images/connect-power-bi/power.bi.ldap.png new file mode 100644 index 0000000000..ea85568605 Binary files /dev/null and b/quickstarts/business-intelligence/images/connect-power-bi/power.bi.ldap.png differ diff --git a/quickstarts/business-intelligence/images/connect-power-bi/power.bi.navigator.png b/quickstarts/business-intelligence/images/connect-power-bi/power.bi.navigator.png new file mode 100644 index 0000000000..acde67caf8 Binary files /dev/null and b/quickstarts/business-intelligence/images/connect-power-bi/power.bi.navigator.png differ diff --git a/quickstarts/business-intelligence/images/connect-power-bi/power.bi.overview.blocks.png b/quickstarts/business-intelligence/images/connect-power-bi/power.bi.overview.blocks.png new file mode 100644 index 0000000000..f611891f52 Binary files /dev/null and b/quickstarts/business-intelligence/images/connect-power-bi/power.bi.overview.blocks.png differ diff --git a/quickstarts/business-intelligence/images/connect-power-bi/power.bi.publish.png b/quickstarts/business-intelligence/images/connect-power-bi/power.bi.publish.png new file mode 100644 index 0000000000..cbc98b112c Binary files /dev/null and b/quickstarts/business-intelligence/images/connect-power-bi/power.bi.publish.png differ diff --git a/quickstarts/business-intelligence/images/connect-power-bi/power.bi.report.png b/quickstarts/business-intelligence/images/connect-power-bi/power.bi.report.png new file mode 100644 index 
0000000000..4625687010 Binary files /dev/null and b/quickstarts/business-intelligence/images/connect-power-bi/power.bi.report.png differ diff --git a/quickstarts/business-intelligence/images/connect-power-bi/power.bi.server.connect.png b/quickstarts/business-intelligence/images/connect-power-bi/power.bi.server.connect.png new file mode 100644 index 0000000000..17f82fae55 Binary files /dev/null and b/quickstarts/business-intelligence/images/connect-power-bi/power.bi.server.connect.png differ diff --git a/quickstarts/business-intelligence/images/connect-power-bi/power.bi.splash.screen.png b/quickstarts/business-intelligence/images/connect-power-bi/power.bi.splash.screen.png new file mode 100644 index 0000000000..964d8ce7db Binary files /dev/null and b/quickstarts/business-intelligence/images/connect-power-bi/power.bi.splash.screen.png differ diff --git a/quickstarts/business-intelligence/images/connect-power-bi/power.bi.success.png b/quickstarts/business-intelligence/images/connect-power-bi/power.bi.success.png new file mode 100644 index 0000000000..8247465c1c Binary files /dev/null and b/quickstarts/business-intelligence/images/connect-power-bi/power.bi.success.png differ diff --git a/quickstarts/business-intelligence/images/connect-power-bi/power.bi.workspace.png b/quickstarts/business-intelligence/images/connect-power-bi/power.bi.workspace.png new file mode 100644 index 0000000000..067d768e50 Binary files /dev/null and b/quickstarts/business-intelligence/images/connect-power-bi/power.bi.workspace.png differ diff --git a/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image1.wmf b/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image1.wmf new file mode 100644 index 0000000000..0fafe3580b Binary files /dev/null and b/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image1.wmf differ diff --git a/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image10.png b/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image10.png new file mode 100644 index 0000000000..00918066a2 Binary files /dev/null and b/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image10.png differ diff --git a/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image11.png b/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image11.png new file mode 100644 index 0000000000..9b700fd8e5 Binary files /dev/null and b/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image11.png differ diff --git a/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image12.png b/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image12.png new file mode 100644 index 0000000000..733f9cb2b6 Binary files /dev/null and b/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image12.png differ diff --git a/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image13.png b/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image13.png new file mode 100644 index 0000000000..acf01ae294 Binary files /dev/null and b/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image13.png differ diff --git a/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image14.png 
b/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image14.png new file mode 100644 index 0000000000..c517003872 Binary files /dev/null and b/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image14.png differ diff --git a/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image15.png b/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image15.png new file mode 100644 index 0000000000..3eb1b859d7 Binary files /dev/null and b/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image15.png differ diff --git a/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image16.png b/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image16.png new file mode 100644 index 0000000000..67d7b50ba6 Binary files /dev/null and b/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image16.png differ diff --git a/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image17.png b/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image17.png new file mode 100644 index 0000000000..832845c073 Binary files /dev/null and b/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image17.png differ diff --git a/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image18.png b/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image18.png new file mode 100644 index 0000000000..86f6dbf4f6 Binary files /dev/null and b/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image18.png differ diff --git a/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image19.png b/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image19.png new file mode 100644 index 0000000000..c6d63cf64e Binary files /dev/null and b/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image19.png differ diff --git a/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image2.png b/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image2.png new file mode 100644 index 0000000000..b8dfb1371d Binary files /dev/null and b/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image2.png differ diff --git a/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image20.png b/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image20.png new file mode 100644 index 0000000000..183de648a6 Binary files /dev/null and b/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image20.png differ diff --git a/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image21.png b/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image21.png new file mode 100644 index 0000000000..b359c44a20 Binary files /dev/null and b/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image21.png differ diff --git a/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image22.png b/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image22.png new file mode 100644 index 0000000000..7cfd354745 Binary files /dev/null and 
b/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image22.png differ diff --git a/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image23.png b/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image23.png new file mode 100644 index 0000000000..d645ec2600 Binary files /dev/null and b/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image23.png differ diff --git a/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image24.png b/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image24.png new file mode 100644 index 0000000000..d0531eba37 Binary files /dev/null and b/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image24.png differ diff --git a/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image25.png b/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image25.png new file mode 100644 index 0000000000..c2c3b85ec0 Binary files /dev/null and b/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image25.png differ diff --git a/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image26.png b/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image26.png new file mode 100644 index 0000000000..ef54a7aa72 Binary files /dev/null and b/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image26.png differ diff --git a/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image27.png b/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image27.png new file mode 100644 index 0000000000..4d8396b4de Binary files /dev/null and b/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image27.png differ diff --git a/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image28.png b/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image28.png new file mode 100644 index 0000000000..4c185dbc0c Binary files /dev/null and b/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image28.png differ diff --git a/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image3.png b/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image3.png new file mode 100644 index 0000000000..26a1c53744 Binary files /dev/null and b/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image3.png differ diff --git a/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image4.png b/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image4.png new file mode 100644 index 0000000000..3a841281ac Binary files /dev/null and b/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image4.png differ diff --git a/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image5.png b/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image5.png new file mode 100644 index 0000000000..c5f16aa448 Binary files /dev/null and b/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image5.png differ diff --git a/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image6.png 
b/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image6.png new file mode 100644 index 0000000000..ac3374293c Binary files /dev/null and b/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image6.png differ diff --git a/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image7.png b/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image7.png new file mode 100644 index 0000000000..7346beb27b Binary files /dev/null and b/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image7.png differ diff --git a/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image8.png b/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image8.png new file mode 100644 index 0000000000..62fa1c1591 Binary files /dev/null and b/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image8.png differ diff --git a/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image9.png b/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image9.png new file mode 100644 index 0000000000..30e7317a53 Binary files /dev/null and b/quickstarts/cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image9.png differ diff --git a/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Buckets-1.PNG b/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Buckets-1.PNG new file mode 100644 index 0000000000..76345834b3 Binary files /dev/null and b/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Buckets-1.PNG differ diff --git a/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Buckets-2.PNG b/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Buckets-2.PNG new file mode 100644 index 0000000000..88b8009e4a Binary files /dev/null and b/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Buckets-2.PNG differ diff --git a/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Buckets-3.PNG b/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Buckets-3.PNG new file mode 100644 index 0000000000..47ea932a0b Binary files /dev/null and b/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Buckets-3.PNG differ diff --git a/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Cat-1.PNG b/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Cat-1.PNG new file mode 100644 index 0000000000..ee9c0a0e9b Binary files /dev/null and b/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Cat-1.PNG differ diff --git a/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Cat-2.PNG b/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Cat-2.PNG new file mode 100644 index 0000000000..43859d5b81 Binary files /dev/null and b/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Cat-2.PNG differ diff --git a/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Glue-1.PNG b/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Glue-1.PNG new file mode 100644 index 0000000000..0a58d302f0 Binary files /dev/null and b/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Glue-1.PNG differ diff --git 
a/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Glue-2.PNG b/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Glue-2.PNG new file mode 100644 index 0000000000..365acc592f Binary files /dev/null and b/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Glue-2.PNG differ diff --git a/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Glue-3.PNG b/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Glue-3.PNG new file mode 100644 index 0000000000..ec5fb8e116 Binary files /dev/null and b/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Glue-3.PNG differ diff --git a/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Glue-script-1.PNG b/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Glue-script-1.PNG new file mode 100644 index 0000000000..f306fa1911 Binary files /dev/null and b/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Glue-script-1.PNG differ diff --git a/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Glue-script-2.PNG b/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Glue-script-2.PNG new file mode 100644 index 0000000000..70b14bc60d Binary files /dev/null and b/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Glue-script-2.PNG differ diff --git a/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Glue-script-3.PNG b/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Glue-script-3.PNG new file mode 100644 index 0000000000..eeaeb18363 Binary files /dev/null and b/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Glue-script-3.PNG differ diff --git a/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Glue-script-4.PNG b/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Glue-script-4.PNG new file mode 100644 index 0000000000..920d304df8 Binary files /dev/null and b/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Glue-script-4.PNG differ diff --git a/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Glue-script-5.PNG b/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Glue-script-5.PNG new file mode 100644 index 0000000000..a291096bd2 Binary files /dev/null and b/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Glue-script-5.PNG differ diff --git a/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Results.PNG b/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Results.PNG new file mode 100644 index 0000000000..014a9e0fdb Binary files /dev/null and b/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Results.PNG differ diff --git a/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Role-1.PNG b/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Role-1.PNG new file mode 100644 index 0000000000..d689b918f2 Binary files /dev/null and b/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Role-1.PNG differ diff --git a/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Role-2.PNG b/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Role-2.PNG new file mode 100644 index 0000000000..21e9c15516 Binary files 
/dev/null and b/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Role-2.PNG differ diff --git a/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Role-3.PNG b/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Role-3.PNG new file mode 100644 index 0000000000..c7afa089e0 Binary files /dev/null and b/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Role-3.PNG differ diff --git a/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Role-4.PNG b/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Role-4.PNG new file mode 100644 index 0000000000..c7d066d1cc Binary files /dev/null and b/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Role-4.PNG differ diff --git a/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/secret-1.PNG b/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/secret-1.PNG new file mode 100644 index 0000000000..acf39c710b Binary files /dev/null and b/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/secret-1.PNG differ diff --git a/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/secret-2.PNG b/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/secret-2.PNG new file mode 100644 index 0000000000..066121b438 Binary files /dev/null and b/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/secret-2.PNG differ diff --git a/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/secret-3.PNG b/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/secret-3.PNG new file mode 100644 index 0000000000..1eb88d5514 Binary files /dev/null and b/quickstarts/cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/secret-3.PNG differ diff --git a/quickstarts/cloud-guides/images/integrate-teradata-jupyter-extensions-with-google-vertex-ai/vertex.create.notebook.startupscript.png b/quickstarts/cloud-guides/images/integrate-teradata-jupyter-extensions-with-google-vertex-ai/vertex.create.notebook.startupscript.png new file mode 100644 index 0000000000..2f8b74d145 Binary files /dev/null and b/quickstarts/cloud-guides/images/integrate-teradata-jupyter-extensions-with-google-vertex-ai/vertex.create.notebook.startupscript.png differ diff --git a/quickstarts/cloud-guides/images/integrate-teradata-jupyter-extensions-with-google-vertex-ai/vertex.custom.container.png b/quickstarts/cloud-guides/images/integrate-teradata-jupyter-extensions-with-google-vertex-ai/vertex.custom.container.png new file mode 100644 index 0000000000..e683e374db Binary files /dev/null and b/quickstarts/cloud-guides/images/integrate-teradata-jupyter-extensions-with-google-vertex-ai/vertex.custom.container.png differ diff --git a/quickstarts/cloud-guides/images/integrate-teradata-jupyter-extensions-with-google-vertex-ai/vertex.open.notebook.png b/quickstarts/cloud-guides/images/integrate-teradata-jupyter-extensions-with-google-vertex-ai/vertex.open.notebook.png new file mode 100644 index 0000000000..9d3f58e8c1 Binary files /dev/null and b/quickstarts/cloud-guides/images/integrate-teradata-jupyter-extensions-with-google-vertex-ai/vertex.open.notebook.png differ diff --git a/quickstarts/cloud-guides/images/integrate-teradata-jupyter-extensions-with-sagemaker/sagemaker.notebook.create.lifecycle.config.png 
b/quickstarts/cloud-guides/images/integrate-teradata-jupyter-extensions-with-sagemaker/sagemaker.notebook.create.lifecycle.config.png new file mode 100644 index 0000000000..dcf67cee3f Binary files /dev/null and b/quickstarts/cloud-guides/images/integrate-teradata-jupyter-extensions-with-sagemaker/sagemaker.notebook.create.lifecycle.config.png differ diff --git a/quickstarts/cloud-guides/images/integrate-teradata-jupyter-extensions-with-sagemaker/sagemaker.notebook.create.notebook.instance.png b/quickstarts/cloud-guides/images/integrate-teradata-jupyter-extensions-with-sagemaker/sagemaker.notebook.create.notebook.instance.png new file mode 100644 index 0000000000..3fa775e8e5 Binary files /dev/null and b/quickstarts/cloud-guides/images/integrate-teradata-jupyter-extensions-with-sagemaker/sagemaker.notebook.create.notebook.instance.png differ diff --git a/quickstarts/cloud-guides/images/integrate-teradata-jupyter-extensions-with-sagemaker/sagemaker.notebook.notebook.inservice.png b/quickstarts/cloud-guides/images/integrate-teradata-jupyter-extensions-with-sagemaker/sagemaker.notebook.notebook.inservice.png new file mode 100644 index 0000000000..67cddb7d17 Binary files /dev/null and b/quickstarts/cloud-guides/images/integrate-teradata-jupyter-extensions-with-sagemaker/sagemaker.notebook.notebook.inservice.png differ diff --git a/quickstarts/cloud-guides/images/integrate-teradata-jupyter-extensions-with-sagemaker/sagemaker.notebook.start.png b/quickstarts/cloud-guides/images/integrate-teradata-jupyter-extensions-with-sagemaker/sagemaker.notebook.start.png new file mode 100644 index 0000000000..f8d20945da Binary files /dev/null and b/quickstarts/cloud-guides/images/integrate-teradata-jupyter-extensions-with-sagemaker/sagemaker.notebook.start.png differ diff --git a/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image1.wmf b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image1.wmf new file mode 100644 index 0000000000..0fafe3580b Binary files /dev/null and b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image1.wmf differ diff --git a/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image10.png b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image10.png new file mode 100644 index 0000000000..9e146b8c0d Binary files /dev/null and b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image10.png differ diff --git a/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image11.png b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image11.png new file mode 100644 index 0000000000..c8e2bdb1b2 Binary files /dev/null and b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image11.png differ diff --git a/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image12.png b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image12.png new file mode 100644 index 0000000000..e10fc25eb6 Binary files /dev/null and b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image12.png differ diff --git 
a/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image13.png b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image13.png new file mode 100644 index 0000000000..6e50d0f5ec Binary files /dev/null and b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image13.png differ diff --git a/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image14.png b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image14.png new file mode 100644 index 0000000000..a7a0ddd648 Binary files /dev/null and b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image14.png differ diff --git a/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image15.png b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image15.png new file mode 100644 index 0000000000..3d24a0fa00 Binary files /dev/null and b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image15.png differ diff --git a/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image16.png b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image16.png new file mode 100644 index 0000000000..33c18a7268 Binary files /dev/null and b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image16.png differ diff --git a/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image17.png b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image17.png new file mode 100644 index 0000000000..f69ead40f5 Binary files /dev/null and b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image17.png differ diff --git a/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image18.png b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image18.png new file mode 100644 index 0000000000..2cd35c1806 Binary files /dev/null and b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image18.png differ diff --git a/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image19.png b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image19.png new file mode 100644 index 0000000000..a4f29b8ed8 Binary files /dev/null and b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image19.png differ diff --git a/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image2.png b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image2.png new file mode 100644 index 0000000000..2f16ec1f85 Binary files /dev/null and b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image2.png differ diff --git a/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image20.png 
b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image20.png new file mode 100644 index 0000000000..bdb1c7eda3 Binary files /dev/null and b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image20.png differ diff --git a/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image21.png b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image21.png new file mode 100644 index 0000000000..f279c7f2db Binary files /dev/null and b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image21.png differ diff --git a/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image22.png b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image22.png new file mode 100644 index 0000000000..d321e7aa52 Binary files /dev/null and b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image22.png differ diff --git a/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image23.png b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image23.png new file mode 100644 index 0000000000..f9e68e15e1 Binary files /dev/null and b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image23.png differ diff --git a/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image24.png b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image24.png new file mode 100644 index 0000000000..e4896b6f19 Binary files /dev/null and b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image24.png differ diff --git a/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image25.png b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image25.png new file mode 100644 index 0000000000..c643b2fa7f Binary files /dev/null and b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image25.png differ diff --git a/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image26.png b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image26.png new file mode 100644 index 0000000000..c970b75941 Binary files /dev/null and b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image26.png differ diff --git a/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image27.png b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image27.png new file mode 100644 index 0000000000..d24b122185 Binary files /dev/null and b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image27.png differ diff --git a/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image28.png b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image28.png new file mode 100644 index 0000000000..ea0ec6c1e9 Binary files /dev/null and 
b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image28.png differ diff --git a/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image29.png b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image29.png new file mode 100644 index 0000000000..55df058a9c Binary files /dev/null and b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image29.png differ diff --git a/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image3.png b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image3.png new file mode 100644 index 0000000000..e9e1522ef9 Binary files /dev/null and b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image3.png differ diff --git a/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image30.png b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image30.png new file mode 100644 index 0000000000..5bffd82b61 Binary files /dev/null and b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image30.png differ diff --git a/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image4.png b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image4.png new file mode 100644 index 0000000000..7c2a07601a Binary files /dev/null and b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image4.png differ diff --git a/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image41.png b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image41.png new file mode 100644 index 0000000000..3efd7ba2c6 Binary files /dev/null and b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image41.png differ diff --git a/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image42.png b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image42.png new file mode 100644 index 0000000000..331ab3a3c6 Binary files /dev/null and b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image42.png differ diff --git a/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image43.png b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image43.png new file mode 100644 index 0000000000..1e354d0a1e Binary files /dev/null and b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image43.png differ diff --git a/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image44.png b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image44.png new file mode 100644 index 0000000000..9245677522 Binary files /dev/null and b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image44.png differ diff --git 
a/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image45.png b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image45.png new file mode 100644 index 0000000000..6e85e73fb0 Binary files /dev/null and b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image45.png differ diff --git a/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image46.png b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image46.png new file mode 100644 index 0000000000..a7da8023f3 Binary files /dev/null and b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image46.png differ diff --git a/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image5.png b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image5.png new file mode 100644 index 0000000000..e25c3fd5d3 Binary files /dev/null and b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image5.png differ diff --git a/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image6.png b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image6.png new file mode 100644 index 0000000000..adc0e7e4c3 Binary files /dev/null and b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image6.png differ diff --git a/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image7.png b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image7.png new file mode 100644 index 0000000000..e8d7d24d0d Binary files /dev/null and b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image7.png differ diff --git a/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image8.png b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image8.png new file mode 100644 index 0000000000..0071e67b20 Binary files /dev/null and b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image8.png differ diff --git a/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image9.png b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image9.png new file mode 100644 index 0000000000..c4913ea3b7 Binary files /dev/null and b/quickstarts/cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image9.png differ diff --git a/quickstarts/cloud-guides/images/integrate-teradata-vantage-with-google-cloud-data-catalog/image1.wmf b/quickstarts/cloud-guides/images/integrate-teradata-vantage-with-google-cloud-data-catalog/image1.wmf new file mode 100644 index 0000000000..0fafe3580b Binary files /dev/null and b/quickstarts/cloud-guides/images/integrate-teradata-vantage-with-google-cloud-data-catalog/image1.wmf differ diff --git a/quickstarts/cloud-guides/images/integrate-teradata-vantage-with-google-cloud-data-catalog/image2.png b/quickstarts/cloud-guides/images/integrate-teradata-vantage-with-google-cloud-data-catalog/image2.png new file mode 100644 index 
0000000000..41ea223fc4 Binary files /dev/null and b/quickstarts/cloud-guides/images/integrate-teradata-vantage-with-google-cloud-data-catalog/image2.png differ diff --git a/quickstarts/cloud-guides/images/integrate-teradata-vantage-with-google-cloud-data-catalog/image3.png b/quickstarts/cloud-guides/images/integrate-teradata-vantage-with-google-cloud-data-catalog/image3.png new file mode 100644 index 0000000000..e14b447e96 Binary files /dev/null and b/quickstarts/cloud-guides/images/integrate-teradata-vantage-with-google-cloud-data-catalog/image3.png differ diff --git a/quickstarts/cloud-guides/images/integrate-teradata-vantage-with-google-cloud-data-catalog/image4.png b/quickstarts/cloud-guides/images/integrate-teradata-vantage-with-google-cloud-data-catalog/image4.png new file mode 100644 index 0000000000..ddc007b464 Binary files /dev/null and b/quickstarts/cloud-guides/images/integrate-teradata-vantage-with-google-cloud-data-catalog/image4.png differ diff --git a/quickstarts/cloud-guides/images/integrate-teradata-vantage-with-google-cloud-data-catalog/image5.png b/quickstarts/cloud-guides/images/integrate-teradata-vantage-with-google-cloud-data-catalog/image5.png new file mode 100644 index 0000000000..a71549fc52 Binary files /dev/null and b/quickstarts/cloud-guides/images/integrate-teradata-vantage-with-google-cloud-data-catalog/image5.png differ diff --git a/quickstarts/cloud-guides/images/integrate-teradata-vantage-with-google-cloud-data-catalog/image6.png b/quickstarts/cloud-guides/images/integrate-teradata-vantage-with-google-cloud-data-catalog/image6.png new file mode 100644 index 0000000000..651e420a75 Binary files /dev/null and b/quickstarts/cloud-guides/images/integrate-teradata-vantage-with-google-cloud-data-catalog/image6.png differ diff --git a/quickstarts/cloud-guides/images/sagemaker-with-teradata-vantage/attach.endpoint.configuration.png b/quickstarts/cloud-guides/images/sagemaker-with-teradata-vantage/attach.endpoint.configuration.png new file mode 100644 index 0000000000..f58fb5a015 Binary files /dev/null and b/quickstarts/cloud-guides/images/sagemaker-with-teradata-vantage/attach.endpoint.configuration.png differ diff --git a/quickstarts/cloud-guides/images/sagemaker-with-teradata-vantage/choose.an.algorithm.png b/quickstarts/cloud-guides/images/sagemaker-with-teradata-vantage/choose.an.algorithm.png new file mode 100644 index 0000000000..6879f3a382 Binary files /dev/null and b/quickstarts/cloud-guides/images/sagemaker-with-teradata-vantage/choose.an.algorithm.png differ diff --git a/quickstarts/cloud-guides/images/sagemaker-with-teradata-vantage/container.definition.1.png b/quickstarts/cloud-guides/images/sagemaker-with-teradata-vantage/container.definition.1.png new file mode 100644 index 0000000000..ad95830a7a Binary files /dev/null and b/quickstarts/cloud-guides/images/sagemaker-with-teradata-vantage/container.definition.1.png differ diff --git a/quickstarts/cloud-guides/images/sagemaker-with-teradata-vantage/create.endpoint.configuration.png b/quickstarts/cloud-guides/images/sagemaker-with-teradata-vantage/create.endpoint.configuration.png new file mode 100644 index 0000000000..216dba588e Binary files /dev/null and b/quickstarts/cloud-guides/images/sagemaker-with-teradata-vantage/create.endpoint.configuration.png differ diff --git a/quickstarts/cloud-guides/images/sagemaker-with-teradata-vantage/create.endpoint.png b/quickstarts/cloud-guides/images/sagemaker-with-teradata-vantage/create.endpoint.png new file mode 100644 index 0000000000..29554f15a9 Binary files 
/dev/null and b/quickstarts/cloud-guides/images/sagemaker-with-teradata-vantage/create.endpoint.png differ diff --git a/quickstarts/cloud-guides/images/sagemaker-with-teradata-vantage/create.iam.role.png b/quickstarts/cloud-guides/images/sagemaker-with-teradata-vantage/create.iam.role.png new file mode 100644 index 0000000000..4b491c8986 Binary files /dev/null and b/quickstarts/cloud-guides/images/sagemaker-with-teradata-vantage/create.iam.role.png differ diff --git a/quickstarts/cloud-guides/images/sagemaker-with-teradata-vantage/create.notebook.png b/quickstarts/cloud-guides/images/sagemaker-with-teradata-vantage/create.notebook.png new file mode 100644 index 0000000000..342bfab499 Binary files /dev/null and b/quickstarts/cloud-guides/images/sagemaker-with-teradata-vantage/create.notebook.png differ diff --git a/quickstarts/cloud-guides/images/sagemaker-with-teradata-vantage/create.training.job.png b/quickstarts/cloud-guides/images/sagemaker-with-teradata-vantage/create.training.job.png new file mode 100644 index 0000000000..6bf7c467dc Binary files /dev/null and b/quickstarts/cloud-guides/images/sagemaker-with-teradata-vantage/create.training.job.png differ diff --git a/quickstarts/cloud-guides/images/sagemaker-with-teradata-vantage/input.data.configuration.png b/quickstarts/cloud-guides/images/sagemaker-with-teradata-vantage/input.data.configuration.png new file mode 100644 index 0000000000..0b00b53fcc Binary files /dev/null and b/quickstarts/cloud-guides/images/sagemaker-with-teradata-vantage/input.data.configuration.png differ diff --git a/quickstarts/cloud-guides/images/sagemaker-with-teradata-vantage/open.notebook.instance.png b/quickstarts/cloud-guides/images/sagemaker-with-teradata-vantage/open.notebook.instance.png new file mode 100644 index 0000000000..1290dd2ff4 Binary files /dev/null and b/quickstarts/cloud-guides/images/sagemaker-with-teradata-vantage/open.notebook.instance.png differ diff --git a/quickstarts/cloud-guides/images/sagemaker-with-teradata-vantage/output.data.configuration.png b/quickstarts/cloud-guides/images/sagemaker-with-teradata-vantage/output.data.configuration.png new file mode 100644 index 0000000000..b81f35193e Binary files /dev/null and b/quickstarts/cloud-guides/images/sagemaker-with-teradata-vantage/output.data.configuration.png differ diff --git a/quickstarts/cloud-guides/images/sagemaker-with-teradata-vantage/resource.configuration.png b/quickstarts/cloud-guides/images/sagemaker-with-teradata-vantage/resource.configuration.png new file mode 100644 index 0000000000..37c7b1c9a6 Binary files /dev/null and b/quickstarts/cloud-guides/images/sagemaker-with-teradata-vantage/resource.configuration.png differ diff --git a/quickstarts/cloud-guides/images/sagemaker-with-teradata-vantage/select.endpoint.configuration.png b/quickstarts/cloud-guides/images/sagemaker-with-teradata-vantage/select.endpoint.configuration.png new file mode 100644 index 0000000000..efcf6d65b3 Binary files /dev/null and b/quickstarts/cloud-guides/images/sagemaker-with-teradata-vantage/select.endpoint.configuration.png differ diff --git a/quickstarts/cloud-guides/images/sagemaker-with-teradata-vantage/start.new.file.png b/quickstarts/cloud-guides/images/sagemaker-with-teradata-vantage/start.new.file.png new file mode 100644 index 0000000000..09b9e83640 Binary files /dev/null and b/quickstarts/cloud-guides/images/sagemaker-with-teradata-vantage/start.new.file.png differ diff --git a/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image1.wmf 
b/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image1.wmf new file mode 100644 index 0000000000..0fafe3580b Binary files /dev/null and b/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image1.wmf differ diff --git a/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image10.png b/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image10.png new file mode 100644 index 0000000000..32d98c19d8 Binary files /dev/null and b/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image10.png differ diff --git a/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image11.png b/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image11.png new file mode 100644 index 0000000000..a546f9d233 Binary files /dev/null and b/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image11.png differ diff --git a/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image12.png b/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image12.png new file mode 100644 index 0000000000..1972489bdc Binary files /dev/null and b/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image12.png differ diff --git a/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image13.png b/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image13.png new file mode 100644 index 0000000000..139f569b4a Binary files /dev/null and b/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image13.png differ diff --git a/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image14.png b/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image14.png new file mode 100644 index 0000000000..b6f86f44bf Binary files /dev/null and b/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image14.png differ diff --git a/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image15.png b/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image15.png new file mode 100644 index 0000000000..167170001e Binary files /dev/null and b/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image15.png differ diff --git a/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image16.png b/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image16.png new file mode 100644 index 0000000000..6846ca85cb Binary files /dev/null and b/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image16.png differ diff --git a/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image17.png b/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image17.png new file mode 100644 index 0000000000..3488786a4b Binary files /dev/null and b/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image17.png differ diff --git 
a/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image18.png b/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image18.png new file mode 100644 index 0000000000..40ab58077d Binary files /dev/null and b/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image18.png differ diff --git a/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image19.png b/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image19.png new file mode 100644 index 0000000000..2a8900c079 Binary files /dev/null and b/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image19.png differ diff --git a/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image2.png b/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image2.png new file mode 100644 index 0000000000..ac948cdac0 Binary files /dev/null and b/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image2.png differ diff --git a/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image20.png b/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image20.png new file mode 100644 index 0000000000..e584a5f275 Binary files /dev/null and b/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image20.png differ diff --git a/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image21.png b/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image21.png new file mode 100644 index 0000000000..e30f97529d Binary files /dev/null and b/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image21.png differ diff --git a/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image22.png b/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image22.png new file mode 100644 index 0000000000..218ed09771 Binary files /dev/null and b/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image22.png differ diff --git a/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image23.png b/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image23.png new file mode 100644 index 0000000000..a6c560757c Binary files /dev/null and b/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image23.png differ diff --git a/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image24.png b/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image24.png new file mode 100644 index 0000000000..1ed1a8e525 Binary files /dev/null and b/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image24.png differ diff --git a/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image25.png b/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image25.png new file mode 100644 index 0000000000..829e6a76f8 Binary files /dev/null and 
b/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image25.png differ diff --git a/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image26.png b/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image26.png new file mode 100644 index 0000000000..d75e9e67f5 Binary files /dev/null and b/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image26.png differ diff --git a/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image27.png b/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image27.png new file mode 100644 index 0000000000..cc6af35b9c Binary files /dev/null and b/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image27.png differ diff --git a/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image28.png b/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image28.png new file mode 100644 index 0000000000..6813315bb7 Binary files /dev/null and b/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image28.png differ diff --git a/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image3.png b/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image3.png new file mode 100644 index 0000000000..26e835ecc1 Binary files /dev/null and b/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image3.png differ diff --git a/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image4.png b/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image4.png new file mode 100644 index 0000000000..ac3cc6c8d2 Binary files /dev/null and b/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image4.png differ diff --git a/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image5.png b/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image5.png new file mode 100644 index 0000000000..038549ecbe Binary files /dev/null and b/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image5.png differ diff --git a/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image6.png b/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image6.png new file mode 100644 index 0000000000..99c3c2b7a8 Binary files /dev/null and b/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image6.png differ diff --git a/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image7.png b/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image7.png new file mode 100644 index 0000000000..7deb2e1210 Binary files /dev/null and b/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image7.png differ diff --git a/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image8.png b/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image8.png new file mode 100644 index 
0000000000..c8386281d1 Binary files /dev/null and b/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image8.png differ diff --git a/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image9.png b/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image9.png new file mode 100644 index 0000000000..a16c7ec23f Binary files /dev/null and b/quickstarts/cloud-guides/images/use-teradata-vantage-with-azure-machine-learning-studio/image9.png differ diff --git a/quickstarts/connect-to-vantage/_category_.json b/quickstarts/connect-to-vantage/_category_.json new file mode 100644 index 0000000000..6b3acb9386 --- /dev/null +++ b/quickstarts/connect-to-vantage/_category_.json @@ -0,0 +1,4 @@ +{ + "label": "Connect to Vantage", + "position": 3 + } \ No newline at end of file diff --git a/quickstarts/connect-to-vantage/configure-a-teradata-vantage-connection-in-dbeaver.md b/quickstarts/connect-to-vantage/configure-a-teradata-vantage-connection-in-dbeaver.md new file mode 100644 index 0000000000..e23fd97353 --- /dev/null +++ b/quickstarts/connect-to-vantage/configure-a-teradata-vantage-connection-in-dbeaver.md @@ -0,0 +1,71 @@ +--- +sidebar_position: 2 +author: Adam Tworkiewicz +email: adam.tworkiewicz@teradata.com +page_last_update: March 6th, 2022 +description: Configure a Teradata Vantage connection in DBeaver. +keywords: [data warehouses, compute storage separation, teradata, vantage, cloud data platform, object storage, business intelligence, enterprise analytics, dbeaver, dbeaver prod, sql ide] +--- + +# Configure a Teradata Vantage connection in DBeaver + +## Overview + +This how-to demonstrates how to create a connection to Teradata Vantage with DBeaver. + +## Prerequisites + +import ClearscapeDocsNote from '../_partials/vantage_clearscape_analytics.mdx' + +* Access to a Teradata Vantage instance. + +* DBeaver installed. See [DBeaver Community](https://dbeaver.io/download) or [DBeaver PRO](https://dbeaver.com/download) for installation options. + +## Add a Teradata connection to DBeaver + +1. Start the new connection wizard by clicking on the plug icon (![Add Connection Plug Icon](../other-integrations/images/configure-a-teradata-connection-in-dbeaver/plug-icon.png)) in the upper left corner of the application window or go to `Database -> New Database Connection`. +2. On `Select your database` screen, start typing `teradata` and select the Teradata icon. +![Select your database](../other-integrations/images/configure-a-teradata-connection-in-dbeaver/select-your-database-windows.png)] +3. On the main tab, you need to set all primary connection settings. The required ones include `Host`, `Port`, `Database`, `Username`, and `Password`. +:::tip +In Teradata Vantage, when a user is created a corresponding database with the same is created as well. DBeaver requires that you enter the database. If you don't know what database you want to connect to, use your username in the `database` field. +::: +:::tip +With DBeaver PRO, you can not only use the standard ordering of tables but also hierarchically link tables to a specific database or user. Expanding and collapsing the databases or users will help you navigate from one area to another without swamping the Database Navigator window. Check the `Show databases and users hierarchically` box to enable this setting. +::: +:::tip +In many environments Teradata Vantage can only be accessed using the TLS protocol. 
When in DBeaver PRO, check `Use TLS protocol` option to enable TLS. +::: +![Teradata connection settings](../other-integrations/images/configure-a-teradata-connection-in-dbeaver/teradata-connection-settings-windows.png) +4. Click on `Finish`. + +## Optional: Logon Mechanisms + +The default logon mechanism when creating a DBeaver connection is TD2. To add other logon mechanisms, follow the steps below: + +1. Navigate to the database menu and click on Driver Manager. +2. From the list of driver names, select Teradata and click "Copy". +![Copy the Teradata driver](../other-integrations/images/configure-a-teradata-connection-in-dbeaver/copy-driver.png) +3. In the "URL Template" field, define your selected logon mechanism. ++ +`jdbc:teradata://\{host}/LOGMECH=LDAP,DATABASE=\{database},DBS_PORT=\{port}` +![Configure connection string](../other-integrations/images/configure-a-teradata-connection-in-dbeaver/configure-driver-string.png) +4. Click "OK". +5. The new driver is now available to create connections with the selected logon mechanism. +![Create a connection](../other-integrations/images/configure-a-teradata-connection-in-dbeaver/create-connection.png) +6. The process for setting up a new connection with the alternative mechanism is the same as described above for adding a new connection. +![Configure connection](../other-integrations/images/configure-a-teradata-connection-in-dbeaver/teradata-connection-settings-windows-ldap.png) + +## Optional: SSH tunneling + +If your database cannot be accessed directly, you can use an SSH tunnel. All settings are available on the `SSH` tab. DBeaver supports the following authentication methods: user/password, public key, SSH agent authentication. + +![Teradata connection settings SSH](../other-integrations/images/configure-a-teradata-connection-in-dbeaver/teradata-connection-settings-ssh-windows.png) + +## Summary + +This how-to demonstrated how to create a connection to Teradata Vantage with DBeaver. + +import CommunityLinkPartial from '../_partials/community_link.mdx'; + + diff --git a/quickstarts/connect-to-vantage/configure-odbc/_category_.json b/quickstarts/connect-to-vantage/configure-odbc/_category_.json new file mode 100644 index 0000000000..651d6bba95 --- /dev/null +++ b/quickstarts/connect-to-vantage/configure-odbc/_category_.json @@ -0,0 +1,4 @@ +{ + "label": "Configure ODBC", + "position": 2 + } \ No newline at end of file diff --git a/quickstarts/connect-to-vantage/configure-odbc/odbc.ubuntu.md b/quickstarts/connect-to-vantage/configure-odbc/odbc.ubuntu.md new file mode 100644 index 0000000000..36360462a4 --- /dev/null +++ b/quickstarts/connect-to-vantage/configure-odbc/odbc.ubuntu.md @@ -0,0 +1,88 @@ +--- +id: ubuntu +sidebar_position: 1 +author: Adam Tworkiewicz +email: adam.tworkiewicz@teradata.com +page_last_update: January 5th, 2022 +description: Use Vantage with ODBC on Ubuntu +keywords: [data warehouses, compute storage separation, teradata, vantage, cloud data platform, object storage, business intelligence, enterprise analytics, odbc, ubuntu] +--- + +import ClearscapeDocsNote from '../../_partials/vantage_clearscape_analytics.mdx' +import CommunityLink from '../../_partials/community_link.mdx' + +# Use Vantage with ODBC on Ubuntu + +## Overview + +This how-to demonstrates how to use the ODBC driver with Teradata Vantage on Ubuntu. + +## Prerequisites + +* Access to a Teradata Vantage instance. 
+ +## Installation + +* Install dependencies: + +```bash +apt update && DEBIAN_FRONTEND=noninteractive apt install -y wget unixodbc unixodbc-dev iodbc python3-pip +``` + +* Install the Teradata ODBC driver for Ubuntu: +```bash +wget https://downloads.teradata.com/download/cdn/connectivity/odbc/17.10.x.x/tdodbc1710__ubuntu_x8664.17.10.00.14-1.tar.gz \ + && tar -xzf tdodbc1710__ubuntu_x8664.17.10.00.14-1.tar.gz \ + && dpkg -i tdodbc1710/tdodbc1710-17.10.00.14-1.x86_64.deb +``` + +* Configure ODBC by creating the file `/etc/odbcinst.ini` with the following content: +```ini +[ODBC Drivers] +Teradata Database ODBC Driver 17.10=Installed + +[Teradata Database ODBC Driver 17.10] +Description=Teradata Database ODBC Driver 17.10 +Driver=/opt/teradata/client/17.10/odbc_64/lib/tdataodbc_sb64.so +``` + +## Use ODBC + +We will validate the installation with a sample Python application. Create a `test.py` file with the following content. +Replace `DBCName=192.168.86.33;UID=dbc;PWD=dbc` with the IP address of your Teradata Vantage instance, your username, and your password: + +```python +import pyodbc + +print(pyodbc.drivers()) + +cnxn = pyodbc.connect('DRIVER={Teradata Database ODBC Driver 17.10};DBCName=192.168.86.33;UID=dbc;PWD=dbc;') +cursor = cnxn.cursor() + +cursor.execute("SELECT CURRENT_DATE") +for row in cursor.fetchall(): + print(row) +``` + +Run the test application: + +```bash +python3 test.py +``` + +You should get output similar to: + +``` +['ODBC Drivers', 'Teradata Database ODBC Driver 17.10'] +(datetime.date(2022, 1, 5), ) +``` + +## Summary + +This how-to demonstrated how to use ODBC with Teradata Vantage on Ubuntu. It showed how to install the Teradata ODBC driver and its dependencies, configure ODBC, and validate connectivity with a simple Python application. + +## Further reading +* [ODBC Driver for Teradata® User Guide](https://docs.teradata.com/search/all?query=ODBC+Driver+for+Teradata+User+Guide&filters=ft%3AisBook~%22true%22&sort=last_update) + + \ No newline at end of file diff --git a/quickstarts/connect-to-vantage/install-teradata-studio-on-mac-m1-m2.md b/quickstarts/connect-to-vantage/install-teradata-studio-on-mac-m1-m2.md new file mode 100644 index 0000000000..7694df011f --- /dev/null +++ b/quickstarts/connect-to-vantage/install-teradata-studio-on-mac-m1-m2.md @@ -0,0 +1,33 @@ +--- +id: install-teradata-studio-on-mac-m1-m2 +sidebar_position: 1 + +author: Satish Chinthanippu +email: satish.chinthanippu@teradata.com +page_last_update: August 14th, 2023 +description: Run Teradata Studio on Apple Mac M1/M2. +keywords: [Teradata Studio, Teradata Studio Express, teradata, vantage, Mac, Apple Mac, Apple Mac M1,Apple Mac M2, Arm based Processor.] +--- + +import CommunityLink from '../_partials/community_link.mdx'; + +# Use Teradata Studio/Express on Apple Mac M1/M2 + +## Overview + +This how-to goes through the installation of Teradata Studio and Teradata Studio Express on Apple Mac M1/M2 machines. + +## Steps to follow + +1. Install and enable the Rosetta binary translator. Follow [the Apple Mac Rosetta Installation Guide](https://support.apple.com/en-us/HT211861). +2. Download and install an x86 64-bit JDK 11 from your preferred vendor. For example, you can download an x86 64-bit JDK 11 from [Azul](https://www.azul.com/downloads/?version=java-11-lts&os=macos&architecture=x86-64-bit&package=jdkGet) +3. 
Download the latest Teradata Studio or Teradata Studio Express release from the Teradata Downloads page: +* [Teradata Studio](https://downloads.teradata.com/download/tools/teradata-studio) +* [Teradata Studio Express](https://downloads.teradata.com/download/tools/teradata-studio-express) +4. Install Teradata Studio or Teradata Studio Express. Refer to the [Teradata Studio and Teradata Studio Express Installation Guide]({attachmentsdir}/Studio-Express-InstallGuide.pdf) for details. + +## Summary + +Apple has introduced ARM-based processors in Apple Mac M1/M2 machines. Intel x64-based applications won't work by default on ARM-based processors. Teradata Studio and Teradata Studio Express don't work by default either, because the current Studio macOS build is an Intel x64-based application. This how-to demonstrated how to install an Intel x64-based JDK and Teradata Studio or Teradata Studio Express on Apple Mac M1/M2. + + \ No newline at end of file diff --git a/quickstarts/create-applications/_category_.json b/quickstarts/create-applications/_category_.json new file mode 100644 index 0000000000..43a055d1cf --- /dev/null +++ b/quickstarts/create-applications/_category_.json @@ -0,0 +1,4 @@ +{ + "label": "Create applications", + "position": 5 + } \ No newline at end of file diff --git a/quickstarts/create-applications/jdbc.md b/quickstarts/create-applications/jdbc.md new file mode 100644 index 0000000000..791d2ad161 --- /dev/null +++ b/quickstarts/create-applications/jdbc.md @@ -0,0 +1,117 @@ +--- +sidebar_position: 1 +id: jdbc +author: Adam Tworkiewicz +email: adam.tworkiewicz@teradata.com +page_last_update: November 14th, 2022 +description: How to connect to Teradata Vantage using JDBC. +keywords: [data warehouses, compute storage separation, teradata, vantage, cloud data platform, JDBC, java applications, business intelligence, enterprise analytics] +--- + +import ClearscapeDocsNote from '../_partials/vantage_clearscape_analytics.mdx' +import CommunityLink from '../_partials/community_link.mdx' + +# Connect to Vantage using JDBC + +## Overview + +This how-to demonstrates how to connect to Teradata Vantage using JDBC, based on a sample Java application: [GitHub JDBC](https://github.com/Teradata/jdbc-sample-app). + +## Prerequisites + +* Access to a Teradata Vantage instance. + + + +* JDK +* Maven + +## Add a dependency to your Maven project + +Add the Teradata JDBC driver as a dependency to your Maven POM XML file: + + +```xml +<dependency> +  <groupId>com.teradata.jdbc</groupId> +  <artifactId>terajdbc</artifactId> +  <version>20.00.00.06</version> +</dependency> +``` +## Code to send a query + +:::note +This step assumes that your Vantage database is available on `localhost` on port `1025`. If you are running Vantage Express on your laptop, you need to expose the port from the VM to the host machine. Refer to your virtualization software documentation for how to forward ports. +::: + +The project is set up. 
All that is left is to load the driver, pass connection and authentication parameters, and run a query: + +```java +package com.teradata.app; + +import java.sql.*; + +public class App { + static final String DB_URL = "jdbc:teradata://localhost"; + static final String USER = "dbc"; + static final String PASS = "dbc"; + static final String QUERY = "SELECT * FROM dbc.dbcinfo"; + + public static void main(String[] args) { + App app = new App(); + app.query(); + } + + public void query() { + Connection conn = null; + Statement stmt = null; + ResultSet rs = null; + + // Open a connection + try { + conn = DriverManager.getConnection(DB_URL, USER, PASS); + stmt = conn.createStatement(); + rs = stmt.executeQuery(QUERY); + + // Extract data from result set + while (rs.next()) { + System.out.println(String.format("setting: %s, value: %s", rs.getString(1), rs.getString(2))); + } + } catch (SQLException e) { + e.printStackTrace(); + } finally { + if (rs != null) { + try { + rs.close(); + } catch (SQLException e) { /* Ignored */} + } + if (stmt != null) { + try { + stmt.close(); + } catch (SQLException e) { /* Ignored */} + } + if (conn != null) { + try { + conn.close(); + } catch (SQLException e) { /* Ignored */} + } + } + } +} +``` + +### Run the tests + +Run the tests: +```bash +mvn test +``` + +## Summary + +This how-to demonstrated how to connect to Teradata Vantage using JDBC. It described a sample Java application with Maven as the build tool that uses the Teradata JDBC driver to send SQL queries to Teradata Vantage. + +## Further reading +* [Teradata JDBC Driver Reference](https://teradata-docs.s3.amazonaws.com/doc/connectivity/jdbc/reference/current/frameset.html) + + \ No newline at end of file diff --git a/quickstarts/create-applications/mule-dbc-example.md b/quickstarts/create-applications/mule-dbc-example.md new file mode 100644 index 0000000000..bc9025bb90 --- /dev/null +++ b/quickstarts/create-applications/mule-dbc-example.md @@ -0,0 +1,141 @@ +--- +sidebar_position: 3 +id: mule-dbc-example +author: Adam Tworkiewicz +email: adam.tworkiewicz@teradata.com +page_last_update: August 30, 2023 +description: Query Teradata Vantage from a Mule service. +keywords: [data warehouses, compute storage separation, teradata, vantage, cloud data platform, object storage, business intelligence, enterprise analytics, Mule, JDBC, microservices] +--- + +import ClearscapeDocsNote from '../_partials/vantage_clearscape_analytics.mdx' +import CommunityLink from '../_partials/community_link.mdx' + +# Query Teradata Vantage from a Mule service + +## Overview + +This example is a clone of the Mulesoft MySQL sample project. +It demonstrates how to query a Teradata database and expose the results over a REST API. + +## Prerequisites + +* Mulesoft Anypoint Studio. You can download a 30-day trial from https://www.mulesoft.com/platform/studio. +* Access to a Teradata Vantage instance. + + + +## Example service + +This example Mule service takes an HTTP request, queries the Teradata Vantage database, and returns the results in JSON format. + +![service flow](../images/flow.png) + +The Mule HTTP connector listens for HTTP GET requests with the form: `http://<host>:8081/?lastname=<last_name>`. +The HTTP connector passes the value of `<last_name>` as one of the message properties to a database connector. 
+The database connector is configured to extract this value and use it in this SQL query: + +```sql +SELECT * FROM hr.employees WHERE LastName = :lastName +``` + +As you can see, we are using a parameterized query that references the value of the parameter passed to the HTTP connector. +So if the HTTP connector receives http://localhost:8081/?lastname=Smith, the SQL query will be: + +```sql +SELECT * FROM hr.employees WHERE LastName = 'Smith' +``` + +The database connector instructs the database server to run the SQL query, retrieves the result of the query, and passes it to the Transform message processor, which converts the result to JSON. +Since the HTTP connector is configured as request-response, the result is returned to the originating HTTP client. + +## Setup + +* Clone the `Teradata/mule-jdbc-example` repository: +```bash + git clone https://github.com/Teradata/mule-jdbc-example +``` + +* Edit `src/main/mule/querying-a-teradata-database.xml`, find the Teradata connection string `jdbc:teradata://<host>/user=<user>,password=<password>`, and replace the Teradata connection parameters to match your environment. + +:::note +Should your Vantage instance be accessible via ClearScape Analytics Experience, you must replace `<host>` with the host URL of your ClearScape Analytics Experience environment. Additionally, the 'user' and 'password' should be updated to reflect your ClearScape Analytics Experience environment's username and password. +::: + +* Create a sample database in your Vantage instance and populate it with sample data. + +```sql + -- create database + CREATE DATABASE HR + AS PERMANENT = 60e6, SPOOL = 120e6; + + -- create table + CREATE SET TABLE HR.Employees ( + GlobalID INTEGER, + FirstName VARCHAR(30), + LastName VARCHAR(30), + DateOfBirth DATE FORMAT 'YYYY-MM-DD', + JoinedDate DATE FORMAT 'YYYY-MM-DD', + DepartmentCode BYTEINT + ) + UNIQUE PRIMARY INDEX ( GlobalID ); + + -- insert a record + INSERT INTO HR.Employees ( + GlobalID, + FirstName, + LastName, + DateOfBirth, + JoinedDate, + DepartmentCode + ) VALUES ( + 101, + 'Test', + 'Testowsky', + '1980-01-05', + '2004-08-01', + 01 + ); +``` + +* Open the project in Anypoint Studio. + * Once in Anypoint Studio, click on `Import projects..`: + + ![Anypoint import projects menu](../images/anypoint.import.projects.png) + + * Select `Anypoint Studio project from File System`: + + ![Anypoint import option](../images/select.import.option.png) + + * Use the directory where you cloned the git repository as the `Project Root`. Leave all other settings at their default values. + +## Run + +* Run the example application in Anypoint Studio using the `Run` menu. +The project will now build and run. It will take a minute. +* Go to your web browser and send the following request: http://localhost:8081/?lastname=Testowsky. You can also send the request from a script; see the short example at the end of this page. + +You should get the following JSON response: + + +```json +[ + { + "JoinedDate": "2004-08-01T00:00:00", + "DateOfBirth": "1980-01-05T00:00:00", + "FirstName": "Test", + "GlobalID": 101, + "DepartmentCode": 1, + "LastName": "Testowsky" + } +] +``` + +## Further reading + +* View this [document](http://www.mulesoft.org/documentation/display/current/Database+Connector) for more information on how to configure a database connector on your machine. +* Access plain [Reference material](http://www.mulesoft.org/documentation/display/current/Database+Connector+Reference) for the Database Connector. +* Learn more about [DataSense](http://www.mulesoft.org/documentation/display/current/DataSense). 
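+ +If you prefer to test the endpoint from a script instead of the browser, a minimal check could look like the sketch below. It assumes the Mule application is running locally on port 8081 and that the Python `requests` package is installed: + +```python +import requests + +# Call the Mule service with the same query parameter used in the browser test +response = requests.get("http://localhost:8081/", params={"lastname": "Testowsky"}) +response.raise_for_status() + +# The service returns the matching HR.Employees rows as JSON +for employee in response.json(): +    print(employee["GlobalID"], employee["FirstName"], employee["LastName"]) +```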
+ + \ No newline at end of file diff --git a/quickstarts/create-applications/send-queries-using-rest-api.md b/quickstarts/create-applications/send-queries-using-rest-api.md new file mode 100644 index 0000000000..3dc7395b30 --- /dev/null +++ b/quickstarts/create-applications/send-queries-using-rest-api.md @@ -0,0 +1,603 @@ +--- +sidebar_position: 4 +author: Sudha Vedula +email: sudha.vedula@teradata.com +page_last_update: May 29th, 2023 +description: Send queries using REST API. Teradata® Query Service is a middleware that provides REST APIs for Vantage. +keywords: [query service, teradata, vantage, query, REST API] +--- + +# Send queries using REST API + +## Overview + +Teradata Query Service is a REST API for Vantage that you can use to run standard SQL statements without managing client-side drivers. Use Query Service if you are looking to query and access the Analytics Database through a REST API. + +This how-to provides examples of common use cases to help you get started with Query Service API. + +## Prerequisites + +Before starting, make sure you have: + +import ClearscapeDocsNote from '../_partials/vantage_clearscape_analytics.mdx' + +* Access to a VantageCloud system where Query Service is provisioned, or a VantageCore with Query Service enabled connectivity. If you are an admin and need to install Query Service, see [Query Service Installation, Configuration, and Usage Guide](https://docs.teradata.com/r/Teradata-Query-Service-Installation-Configuration-and-Usage-Guide-for-Customers/April-2022). + +* Query Service hostname and system name +* Authorization credentials to connect to the database + +Having trouble with the prerequisites? Contact Teradata for setup information. + +## Query Service API examples + +When using the examples, please keep in mind that: + +* The examples in this document use Python, and you can use these to create examples in your language of choice. +* The examples provided here are complete and ready for you to use, although most require a little customization. + * The examples in this document use the URL `https://:1443/`. + * Replace the following variables with your own value: + * ``: Server where Query Service is installed + * ``: Preconfigured alias of the system + +:::note +If your Vantage instance is provided through ClearScape Analytics Experience,``, is the host URL of your ClearScape Analytics Experience environment, `` is 'local'. +::: + + +## Connect to your Query Service instance + +Provide valid credentials to access the target Analytics Database using HTTP Basic or JWT authentication. + +### HTTP Basic authentication + +The database username and password are combined into a string (`"username : password"`) which is then encoded using Base64. The API response contains the authorization method and encoded credentials. + +Request + +``` python , id="queryservice_first_query", role="emits-gtm-events" +import requests +import json +import base64 +requests.packages.urllib3.disable_warnings() + +# run it from local. 
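+# Build the HTTP Basic Authorization header by base64-encoding "username:password" +# and prefixing it with "Basic ". The dbc/dbc credentials below are sample values; +# replace them with your own database username and password.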
+ +db_user, db_password = 'dbc','dbc' +auth_encoded = db_user + ':' + db_password +auth_encoded = base64.b64encode(bytes(auth_encoded, 'utf-8')) +auth_str = 'Basic ' + auth_encoded.decode('utf-8') + +print(auth_str) + +headers = { + 'Content-Type': 'application/json', + 'Authorization': auth_str # base 64 encoded username:password +} + +print(headers) +``` + +Response + +``` +Basic ZGJjOmRiYw== +{ + 'Content-Type': 'application/json', + 'Authorization': 'Basic ZGJjOmRiYw==' +} +``` + +### JWT authentication + +Prerequisites: + +* The user must already exist in the database. + +* The database must be JWT enabled. + +Request + +``` python +import requests +import json +requests.packages.urllib3.disable_warnings() + +# run it from local. + +auth_encoded_jwt = "" +auth_str = "Bearer " + auth_encoded_jwt + +headers = { + 'Content-Type': 'application/json', + 'Authorization': auth_str +} + +print(headers) +``` + +Response + +``` +{'Content-Type': 'application/json', 'Authorization': 'Bearer '} +``` + +## Make a simple API request with basic options + +In the following example, the request includes: + +* `SELECT * FROM DBC.DBCInfo`: The query to the system with the alias ``. +* `'format': 'OBJECT'`: The format for response. The formats supported are: JSON object, JSON array, and CSV. + +:::note +The JSON object format creates one JSON object per row where the column name is the field name, and the column value is the field value. +::: + +* `'includeColumns': true`: The request to include column metadata, such as column names and types, in the response. +* `'rowLimit': 4`: The number of rows to be returned from a query. + +Request + +``` python +url = 'https://:1443/systems//queries' + +payload = { + 'query': example_query, # 'SELECT * FROM DBC.DBCInfo;', + 'format': 'OBJECT', + 'includeColumns': True, + 'rowLimit': 4 +} + +payload_json = json.dumps(payload) + +response = requests.request('POST', url, headers=headers, data=payload_json, verify=False) + +num_rows = response.json().get('results'](0].get('rowCount') +print('NUMBER of ROWS', num_rows) +print('==========================================================') + +print(response.json()) +``` + +Response + +``` json +NUMBER of ROWS 4 +========================================================== +{ + "queueDuration":7, + "queryDuration":227, + "results":[ + { + "resultSet":True, + "columns":[ + { + "name":"DatabaseName", + "type":"CHAR" + }, + { + "name":"USEDSPACE_IN_GB", + "type":"FLOAT" + }, + { + "name":"MAXSPACE_IN_GB", + "type":"FLOAT" + }, + { + "name":"Percentage_Used", + "type":"FLOAT" + }, + { + "name":"REMAININGSPACE_IN_GB", + "type":"FLOAT" + } + ], + "data":[ + { + "DatabaseName":"DBC", + "USEDSPACE_IN_GB":317.76382541656494, + "MAXSPACE_IN_GB":1510.521079641879, + "Percentage_Used":21.03670247964377, + "REMAININGSPACE_IN_GB":1192.757254225314 + }, + { + "DatabaseName":"EM", + "USEDSPACE_IN_GB":0.0007491111755371094, + "MAXSPACE_IN_GB":11.546071618795395, + "Percentage_Used":0.006488017745513208, + "REMAININGSPACE_IN_GB":11.545322507619858 + }, + { + "DatabaseName":"user10", + "USEDSPACE_IN_GB":0.019153594970703125, + "MAXSPACE_IN_GB":9.313225746154785, + "Percentage_Used":0.20566016, + "REMAININGSPACE_IN_GB":9.294072151184082 + }, + { + "DatabaseName":"EMEM", + "USEDSPACE_IN_GB":0.006140708923339844, + "MAXSPACE_IN_GB":4.656612873077393, + "Percentage_Used":0.13187072, + "REMAININGSPACE_IN_GB":4.650472164154053 + }, + { + "DatabaseName":"EMWork", + "USEDSPACE_IN_GB":0.0, + "MAXSPACE_IN_GB":4.656612873077393, + "Percentage_Used":0.0, 
+ "REMAININGSPACE_IN_GB":4.656612873077393 + } + ], + "rowCount":4, + "rowLimitExceeded":True + } + ] +} +``` + +For response parameters, see [Query Service Installation, Configuration, and Usage Guide](https://docs.teradata.com/r/Teradata-Query-Service-Installation-Configuration-and-Usage-Guide-for-Customers/April-2022/Using-the-Query-Service-APIs/Submitting-SQL-Statement/Request-Body). + + +### Request a response in CSV format + +To return an API response in CSV format, set the `*format*` field in the request with the value `*CSV*`. + +The CSV format contains only the query results and not response metadata. The response contains a line for each row, where each line contains the row columns separated by a comma. The following example returns the data as comma-separated values. + +Request + +``` python +# CSV with all rows included + +url = 'https://:1443/systems//queries' + +payload = { + 'query': example_query, # 'SELECT * FROM DBC.DBCInfo;', + 'format': 'CSV', + 'includeColumns': True +} + +payload_json = json.dumps(payload) + +response = requests.request('POST', url, headers=headers, data=payload_json, verify=False) + +print(response.text) +``` + +Response + +``` +DatabaseName,USEDSPACE_IN_GB,MAXSPACE_IN_GB,Percentage_Used,REMAININGSPACE_IN_GB +DBC ,317.7634754180908,1510.521079641879,21.036679308932754,1192.7576042237881 +EM ,7.491111755371094E-4,11.546071618795395,0.006488017745513208,11.545322507619858 +user10 ,0.019153594970703125,9.313225746154785,0.20566016,9.294072151184082 +EMEM ,0.006140708923339844,4.656612873077393,0.13187072,4.650472164154053 +EMWork ,0.0,4.656612873077393,0.0,4.656612873077393 +EMJI ,0.0,2.3283064365386963,0.0,2.3283064365386963 +USER_NAME ,0.0,2.0,0.0,2.0 +readonly ,0.0,0.9313225746154785,0.0,0.9313225746154785 +aug12_db ,7.200241088867188E-5,0.9313225746154785,0.0077312,0.9312505722045898 +SystemFe ,1.8024444580078125E-4,0.7450580596923828,0.024192,0.744877815246582 +dbcmngr ,3.814697265625E-6,0.09313225746154785,0.004096,0.09312844276428223 +EMViews ,0.027594566345214844,0.09313225746154785,29.62944,0.06553769111633301 +tdwm ,6.732940673828125E-4,0.09313225746154785,0.722944,0.09245896339416504 +Crashdumps ,0.0,0.06984921544790268,0.0,0.06984921544790268 +SYSLIB ,0.006252288818359375,0.03725290298461914,16.78336,0.031000614166259766 +SYSBAR ,4.76837158203125E-6,0.03725290298461914,0.0128,0.03724813461303711 +SYSUDTLIB ,3.5381317138671875E-4,0.029802322387695312,1.1872,0.029448509216308594 +External_AP ,0.0,0.01862645149230957,0.0,0.01862645149230957 +SysAdmin ,0.002307891845703125,0.01862645149230957,12.3904,0.016318559646606445 +KZXaDtQp ,0.0,0.009313225746154785,0.0,0.009313225746154785 +s476QJ6O ,0.0,0.009313225746154785,0.0,0.009313225746154785 +hTzz03i7 ,0.0,0.009313225746154785,0.0,0.009313225746154785 +Y5WYUUXj ,0.0,0.009313225746154785,0.0,0.009313225746154785 +``` + +## Use explicit session to submit a query + +Use explicit sessions when a transaction needs to span multiple requests or when using volatile tables. These sessions are only reused if you reference the sessions in a query request. The request is queued if a request references an explicit session already in use. + +1. Create a session + +Send a POST request to the `/system//sessions` endpoint. The request creates a new database session and returns the session details as the response. + +In the following example, the request includes `'auto_commit': True` - the request to commit the query upon completion. 
+ +Request + +``` python +# first create a session +url = 'https://:1443/systems//sessions' + +payload = { + 'auto_commit': True +} + +payload_json = json.dumps(payload) + +response = requests.request('POST', url, headers=headers, data=payload_json, verify=False) + +print(response.text) +``` + +Response + +``` +{ + 'sessionId': 1366010, + 'system': 'testsystem', + 'user': 'dbc', + 'tdSessionNo': 1626922, + 'createMode': 'EXPLICIT', + 'state': 'LOGGINGON', + 'autoCommit': true +} +``` + +2. Use the session created in Step 1 to submit queries + +Send a POST request to the `/system//queries` endpoint. + +The request submits queries to the target system and returns the release and version number of the target system. + +In the following example, the request includes: + +* `SELECT * FROM DBC.DBCInfo`: The query to the system with the alias ``. +* `'format': 'OBJECT'`: The format for response. +* `'Session' : `: The session ID returned in Step 1 to create an explicit session. + + + +Request + +``` python +# use this session to submit queries afterwards + +url = 'https://:1443/systems//queries' + +payload = { + 'query': 'SELECT * FROM DBC.DBCInfo;', + 'format': 'OBJECT', + 'session': 1366010 # <-- sessionId +} +payload_json = json.dumps(payload) + +response = requests.request('POST', url, headers=headers, data=payload_json, verify=False) + +print(response.text) +``` + +Response + +``` json +{ + "queueDuration":6, + "queryDuration":41, + "results":[ + { + "resultSet":true, + "data":[ + { + "InfoKey":"LANGUAGE SUPPORT MODE", + "InfoData":"Standard" + }, + { + "InfoKey":"RELEASE", + "InfoData":"15.10.07.02" + }, + { + "InfoKey":"VERSION", + "InfoData":"15.10.07.02" + } + ], + "rowCount":3, + "rowLimitExceeded":false + } + ] +} +``` + + +## Use asynchronous queries + +Use asynchronous queries when a system or network performance is affected by querying a large group of data or long running queries. + +1. Submit asynchronous queries to the target system and retrieve a Query ID + +Send a POST request to the `/system//queries` endpoint. + +In the following example, the request includes: + +* `SELECT * FROM DBC.DBCInfo`: The query to the system with the alias ``. +* `'format': 'OBJECT'`: The format for response. +* `'spooled_result_set': True`: The indication that the request is asynchronous. + + + +Request + +``` python +## Run async query . + +url = 'https://:1443/systems//queries' + +payload = { + 'query': 'SELECT * FROM DBC.DBCInfo;', + 'format': 'OBJECT', + 'spooled_result_set': True +} + +payload_json = json.dumps(payload) +response = requests.request('POST', url, headers=headers, data=payload_json, verify=False) + +print(response.text) +``` + +Response + +``` +{"id":1366025} +``` + + +2. Get query details using the ID retrieved from Step 1 ++ +Send a GET request to the `/system//queries/` endpoint, replacing `` with the ID retrieved from Step 1. ++ +The request returns the details of the specific query, including `*queryState*`, `*queueOrder*`, `*queueDuration*`, and so on. For a complete list of the response fields and their descriptions, see [Query Service Installation, Configuration, and Usage Guide](https://docs.teradata.com/r/Teradata-Query-Service-Installation-Configuration-and-Usage-Guide-for-Customers/April-2022/Using-the-Query-Service-APIs/Submitting-SQL-Statement/Request-Body). + +Request + +``` python +## response for async query . 
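+# Check queryState in the response; once it reports RESULT_SET_READY, the query +# results can be fetched from the /results endpoint shown in the next step.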
+ +url = 'https://:1443/systems//queries/1366025' + +payload_json = json.dumps(payload) +response = requests.request('GET', url, headers=headers, verify=False) + +print(response.text) +``` + +Response + +``` +{ + "queryId":1366025, + "query":"SELECT * FROM DBC.DBCInfo;", + "batch":false, + "system":"testsystem", + "user":"dbc", + "session":1366015, + "queryState":"RESULT_SET_READY", + "queueOrder":0, + "queueDuration":6, + "queryDuration":9, + "statusCode":200, + "resultSets":{ + + }, + "counts":{ + + }, + "exceptions":{ + + }, + "outParams":{ + + } +} +``` + +3. View resultset for asynchronous query + +Send a GET request to the `/system//queries//results` endpoint, replacing `` with the ID retrieved from Step 1. +The request returns an array of the result sets and update counts produced by the submitted query. + +Request + +``` python +url = 'https://:1443/systems//queries/1366025/results' + +payload_json = json.dumps(payload) +response = requests.request('GET', url, headers=headers, verify=False) + +print(response.text) +``` + +Response + +``` json +{ + "queueDuration":6, + "queryDuration":9, + "results":[ + { + "resultSet":true, + "data":[ + { + "InfoKey":"LANGUAGE SUPPORT MODE", + "InfoData":"Standard" + }, + { + "InfoKey":"RELEASE", + "InfoData":"15.10.07.02" + }, + { + "InfoKey":"VERSION", + "InfoData":"15.10.07.02" + } + ], + "rowCount":3, + "rowLimitExceeded":false + } + ] +} +``` + +## Get a list of active or queued queries + +Send a GET request to the `/system//queries` endpoint. The request returns the IDs of active queries. + +Request + +``` python +url = 'https://:1443/systems//queries' + +payload={} + +response = requests.request('GET', url, headers=headers, data=payload, verify=False) + +print(response.json()) +``` + +Response + +``` json +[ + { + "queryId": 12516087, + "query": "SELECt * from dbcmgr.AlertRequest;", + "batch": false, + "system": "BasicTestSys", + "user": "dbc", + "session": 12516011, + "queryState": "REST_SET_READY", + "queueOrder": 0, + "queueDurayion": 3, + "queryDuration": 3, + "statusCode": 200, + "resultSets": {}, + "counts": {}, + "exceptions": {}, + "outparams": {} + }, + { + "queryId": 12516088, + "query": "SELECt * from dbc.DBQLAmpDataTbl;", + "batch": false, + "system": "BasicTestSys", + "user": "dbc", + "session": 12516011, + "queryState": "REST_SET_READY", + "queueOrder": 0, + "queueDurayion": 3, + "queryDuration": 3, + "statusCode": 200, + "resultSets": {}, + "counts": {}, + "exceptions": {}, + "outparams": {} + } +] +``` + + +## Resources + +* Features, examples, and references: [Query Service Installation, Configuration, and Usage Guide](https://docs.teradata.com/r/Teradata-Query-Service-Installation-Configuration-and-Usage-Guide-for-Customers/April-2022) +* [Query Service API OpenAPI Specification](https://downloads.teradata.com/api/teradata_query_service) + +import CommunityLinkPartial from '../_partials/community_link.mdx'; + + \ No newline at end of file diff --git a/quickstarts/create-applications/teradatasql.md b/quickstarts/create-applications/teradatasql.md new file mode 100644 index 0000000000..f79098e831 --- /dev/null +++ b/quickstarts/create-applications/teradatasql.md @@ -0,0 +1,95 @@ +--- +sidebar_position: 2 +id: teradatasql +author: Krutik Pathak +email: krutik.pathak@teradata.com +page_last_update: August 2nd, 2023 +description: How to connect to Teradata Vantage using teradatasql Python database driver +keywords: [data warehouses, compute storage separation, teradata, vantage, cloud data platform, teradatasql, python 
applications, business intelligence, enterprise analytics] +--- + +import ClearscapeDocsNote from '../_partials/vantage_clearscape_analytics.mdx' +import CommunityLink from '../_partials/community_link.mdx' + +# Connect to Vantage using Python + +## Overview + +This how-to demonstrates how to connect to Vantage using [teradatasql](https://github.com/Teradata/python-driver) Python database driver for Teradata Vantage. + +## Prerequisites + +* 64-bit Python 3.4 or later. + +* `teradatasql` driver installed in your system: +``` +pip install teradatasql +``` + +:::note +`teradatasql` package runs on Windows, macOS (10.14 Mojave or later) and Linux. For Linux, currently only Linux x86-64 architecture is supported. +::: + +* Access to a Teradata Vantage instance. Currently driver is supported for use with Teradata Database 16.10 and later releases. + + + +## Code to send a query + +This is a simple Python code to connect to Teradata Vantage using `teradatasql`. All that is left, is to pass connection and authentication parameters and run a query: + +```python +import teradatasql + +DB_URL = "" #Add Host +USER = "" #Add Username +PASS = "" #Add Password + +try: + # Establish a connection to the Teradata database + with teradatasql.connect(host=DB_URL, user=USER, password=PASS) as con: + # Create a cursor to execute queries + with con.cursor() as cur: + try: + # Creating the table SampleEmployee + cur.execute (f"CREATE SET TABLE {USER}.SampleEmployee \ + (Associate_Id INTEGER, \ + Associate_Name CHAR(25), \ + Job_Title VARCHAR(25)) \ + UNIQUE PRIMARY INDEX (Associate_Id);") + + print(f"Sample table {USER}.SampleEmployee created.") + + # Adding sample data into SampleEmployee table + cur.execute (f"INSERT INTO {USER}.SampleEmployee VALUES (1, 'Richard Hendricks','CEO')") + cur.execute (f"INSERT INTO {USER}.SampleEmployee VALUES (2, 'Jared Dunn','CFO')") + cur.execute (f"INSERT INTO {USER}.SampleEmployee VALUES (3, 'Jian Yang','Intern')") + + print(f"Sample data added to table {USER}.SampleEmployee.") + + # Execute the SELECT query to get the results from SampleEmployee table + cur.execute(f"SELECT * FROM {USER}.SampleEmployee") + + # Extract data from the result set and print it + for row in cur: + print(f"Associate ID: {row[0]}, Associate_Name: {row[1]}, Job_Title:{row[2]}") + + + + except teradatasql.DatabaseError as db_err: + # Handle any errors that occur during query execution + print("Error while executing the query:", db_err) + +except teradatasql.DatabaseError as db_err: + # Handle any errors that occur during the database connection + print("Error while connecting to the Teradata database:", db_err) +``` + +## Summary + +This how-to demonstrated how to connect to Teradata Vantage using `teradatasql` Python database driver. It described a sample Python code to send SQL queries to Teradata Vantage using `teradatasql`. 
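+ +As a small extension of the example above, you can use question-mark bind parameters instead of formatting values into the SQL string. The sketch below assumes the `SampleEmployee` table created above exists and that you substitute your own host, username, and password: + +```python +import teradatasql + +DB_URL = "" #Add Host +USER = ""   #Add Username +PASS = ""   #Add Password + +with teradatasql.connect(host=DB_URL, user=USER, password=PASS) as con: +    with con.cursor() as cur: +        # The driver binds the value server-side, so no manual quoting is needed +        cur.execute(f"SELECT Associate_Name, Job_Title FROM {USER}.SampleEmployee WHERE Associate_Id = ?", [1]) +        for row in cur.fetchall(): +            print(row) +```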
+ +## Further reading +* [teradatasql Python driver reference](https://github.com/Teradata/python-driver) + + diff --git a/quickstarts/elt/images/getting-started-with-airbyte-dbt/dbt_debug.png b/quickstarts/elt/images/getting-started-with-airbyte-dbt/dbt_debug.png new file mode 100644 index 0000000000..c6371d6f41 Binary files /dev/null and b/quickstarts/elt/images/getting-started-with-airbyte-dbt/dbt_debug.png differ diff --git a/quickstarts/elt/images/getting-started-with-airbyte-dbt/dbt_docs_generate.png b/quickstarts/elt/images/getting-started-with-airbyte-dbt/dbt_docs_generate.png new file mode 100644 index 0000000000..a7f9646556 Binary files /dev/null and b/quickstarts/elt/images/getting-started-with-airbyte-dbt/dbt_docs_generate.png differ diff --git a/quickstarts/elt/images/getting-started-with-airbyte-dbt/dbt_docs_serve.png b/quickstarts/elt/images/getting-started-with-airbyte-dbt/dbt_docs_serve.png new file mode 100644 index 0000000000..332a1d391e Binary files /dev/null and b/quickstarts/elt/images/getting-started-with-airbyte-dbt/dbt_docs_serve.png differ diff --git a/quickstarts/elt/images/getting-started-with-airbyte-dbt/dbt_init_database_name.png b/quickstarts/elt/images/getting-started-with-airbyte-dbt/dbt_init_database_name.png new file mode 100644 index 0000000000..26daeff834 Binary files /dev/null and b/quickstarts/elt/images/getting-started-with-airbyte-dbt/dbt_init_database_name.png differ diff --git a/quickstarts/elt/images/getting-started-with-airbyte-dbt/dbt_init_project_name.png b/quickstarts/elt/images/getting-started-with-airbyte-dbt/dbt_init_project_name.png new file mode 100644 index 0000000000..140e8841e7 Binary files /dev/null and b/quickstarts/elt/images/getting-started-with-airbyte-dbt/dbt_init_project_name.png differ diff --git a/quickstarts/elt/images/getting-started-with-airbyte-dbt/dbt_run.png b/quickstarts/elt/images/getting-started-with-airbyte-dbt/dbt_run.png new file mode 100644 index 0000000000..544201bd40 Binary files /dev/null and b/quickstarts/elt/images/getting-started-with-airbyte-dbt/dbt_run.png differ diff --git a/quickstarts/elt/images/getting-started-with-airbyte-dbt/dbt_test.png b/quickstarts/elt/images/getting-started-with-airbyte-dbt/dbt_test.png new file mode 100644 index 0000000000..5c8f871184 Binary files /dev/null and b/quickstarts/elt/images/getting-started-with-airbyte-dbt/dbt_test.png differ diff --git a/quickstarts/elt/images/getting-started-with-airbyte-dbt/raw_data_vantage_dbeaver.png b/quickstarts/elt/images/getting-started-with-airbyte-dbt/raw_data_vantage_dbeaver.png new file mode 100644 index 0000000000..79f2f94efb Binary files /dev/null and b/quickstarts/elt/images/getting-started-with-airbyte-dbt/raw_data_vantage_dbeaver.png differ diff --git a/quickstarts/elt/images/getting-started-with-airbyte/close_airbyte_connection.png b/quickstarts/elt/images/getting-started-with-airbyte/close_airbyte_connection.png new file mode 100644 index 0000000000..26c8d4ac74 Binary files /dev/null and b/quickstarts/elt/images/getting-started-with-airbyte/close_airbyte_connection.png differ diff --git a/quickstarts/elt/images/getting-started-with-airbyte/configuring_destination_teradata_airbyte.png b/quickstarts/elt/images/getting-started-with-airbyte/configuring_destination_teradata_airbyte.png new file mode 100644 index 0000000000..5150ff9bcb Binary files /dev/null and b/quickstarts/elt/images/getting-started-with-airbyte/configuring_destination_teradata_airbyte.png differ diff --git 
a/quickstarts/elt/images/getting-started-with-airbyte/configuring_source_gsheet_airbyte.png b/quickstarts/elt/images/getting-started-with-airbyte/configuring_source_gsheet_airbyte.png new file mode 100644 index 0000000000..35c45ebf2d Binary files /dev/null and b/quickstarts/elt/images/getting-started-with-airbyte/configuring_source_gsheet_airbyte.png differ diff --git a/quickstarts/elt/images/getting-started-with-airbyte/create_first_connection.png b/quickstarts/elt/images/getting-started-with-airbyte/create_first_connection.png new file mode 100644 index 0000000000..62630a71ea Binary files /dev/null and b/quickstarts/elt/images/getting-started-with-airbyte/create_first_connection.png differ diff --git a/quickstarts/elt/images/getting-started-with-airbyte/data_sync_summary.png b/quickstarts/elt/images/getting-started-with-airbyte/data_sync_summary.png new file mode 100644 index 0000000000..5af214d377 Binary files /dev/null and b/quickstarts/elt/images/getting-started-with-airbyte/data_sync_summary.png differ diff --git a/quickstarts/elt/images/getting-started-with-airbyte/data_sync_validation_in_teradata.png b/quickstarts/elt/images/getting-started-with-airbyte/data_sync_validation_in_teradata.png new file mode 100644 index 0000000000..9693013513 Binary files /dev/null and b/quickstarts/elt/images/getting-started-with-airbyte/data_sync_validation_in_teradata.png differ diff --git a/quickstarts/elt/images/getting-started-with-airbyte/delete_airbyte_connection.png b/quickstarts/elt/images/getting-started-with-airbyte/delete_airbyte_connection.png new file mode 100644 index 0000000000..bc58221808 Binary files /dev/null and b/quickstarts/elt/images/getting-started-with-airbyte/delete_airbyte_connection.png differ diff --git a/quickstarts/elt/images/getting-started-with-airbyte/namespaces_in_destination.png b/quickstarts/elt/images/getting-started-with-airbyte/namespaces_in_destination.png new file mode 100644 index 0000000000..2a8cdb4034 Binary files /dev/null and b/quickstarts/elt/images/getting-started-with-airbyte/namespaces_in_destination.png differ diff --git a/quickstarts/elt/images/getting-started-with-airbyte/replication_frequency_24hr.png b/quickstarts/elt/images/getting-started-with-airbyte/replication_frequency_24hr.png new file mode 100644 index 0000000000..9984ee5865 Binary files /dev/null and b/quickstarts/elt/images/getting-started-with-airbyte/replication_frequency_24hr.png differ diff --git a/quickstarts/elt/images/getting-started-with-airbyte/replication_frequency_cron_expression.png b/quickstarts/elt/images/getting-started-with-airbyte/replication_frequency_cron_expression.png new file mode 100644 index 0000000000..af94e87348 Binary files /dev/null and b/quickstarts/elt/images/getting-started-with-airbyte/replication_frequency_cron_expression.png differ diff --git a/quickstarts/elt/images/getting-started-with-airbyte/sample_employees_payrate_google_sheets.png b/quickstarts/elt/images/getting-started-with-airbyte/sample_employees_payrate_google_sheets.png new file mode 100644 index 0000000000..70fab27a21 Binary files /dev/null and b/quickstarts/elt/images/getting-started-with-airbyte/sample_employees_payrate_google_sheets.png differ diff --git a/quickstarts/elt/images/getting-started-with-airbyte/specify_preferences.png b/quickstarts/elt/images/getting-started-with-airbyte/specify_preferences.png new file mode 100644 index 0000000000..c1db5f29a0 Binary files /dev/null and b/quickstarts/elt/images/getting-started-with-airbyte/specify_preferences.png differ diff --git 
a/quickstarts/elt/images/terraform-airbyte-provider/AirbyteCloudTerraform.png b/quickstarts/elt/images/terraform-airbyte-provider/AirbyteCloudTerraform.png new file mode 100644 index 0000000000..4ea87d1c59 Binary files /dev/null and b/quickstarts/elt/images/terraform-airbyte-provider/AirbyteCloudTerraform.png differ diff --git a/quickstarts/elt/images/terraform-airbyte-provider/airbyteconnection.png b/quickstarts/elt/images/terraform-airbyte-provider/airbyteconnection.png new file mode 100644 index 0000000000..035f470759 Binary files /dev/null and b/quickstarts/elt/images/terraform-airbyte-provider/airbyteconnection.png differ diff --git a/quickstarts/elt/images/terraform-airbyte-provider/extensions.png b/quickstarts/elt/images/terraform-airbyte-provider/extensions.png new file mode 100644 index 0000000000..7318bd9c20 Binary files /dev/null and b/quickstarts/elt/images/terraform-airbyte-provider/extensions.png differ diff --git a/quickstarts/elt/images/terraform-airbyte-provider/terraformapply.png b/quickstarts/elt/images/terraform-airbyte-provider/terraformapply.png new file mode 100644 index 0000000000..bc63600db8 Binary files /dev/null and b/quickstarts/elt/images/terraform-airbyte-provider/terraformapply.png differ diff --git a/quickstarts/elt/images/terraform-airbyte-provider/terraforminit.png b/quickstarts/elt/images/terraform-airbyte-provider/terraforminit.png new file mode 100644 index 0000000000..6b33ba1d26 Binary files /dev/null and b/quickstarts/elt/images/terraform-airbyte-provider/terraforminit.png differ diff --git a/quickstarts/elt/images/terraform-airbyte-provider/terraformplan.png b/quickstarts/elt/images/terraform-airbyte-provider/terraformplan.png new file mode 100644 index 0000000000..6b2009f448 Binary files /dev/null and b/quickstarts/elt/images/terraform-airbyte-provider/terraformplan.png differ diff --git a/quickstarts/get-access-to-vantage/_category_.json b/quickstarts/get-access-to-vantage/_category_.json new file mode 100644 index 0000000000..3052b572fc --- /dev/null +++ b/quickstarts/get-access-to-vantage/_category_.json @@ -0,0 +1,4 @@ +{ + "label": "Get access to Vantage", + "position": 2 + } \ No newline at end of file diff --git a/quickstarts/get-access-to-vantage/clearscape-analytics-experience/_category_.json b/quickstarts/get-access-to-vantage/clearscape-analytics-experience/_category_.json new file mode 100644 index 0000000000..8e80a51399 --- /dev/null +++ b/quickstarts/get-access-to-vantage/clearscape-analytics-experience/_category_.json @@ -0,0 +1,4 @@ +{ + "label": "ClearScape Analytics Experience", + "position": 3 + } \ No newline at end of file diff --git a/quickstarts/get-access-to-vantage/clearscape-analytics-experience/getting-started-with-csae.md b/quickstarts/get-access-to-vantage/clearscape-analytics-experience/getting-started-with-csae.md new file mode 100644 index 0000000000..d59c347c50 --- /dev/null +++ b/quickstarts/get-access-to-vantage/clearscape-analytics-experience/getting-started-with-csae.md @@ -0,0 +1,76 @@ +--- +sidebar_position: 1 +author: Vidhan Bhonsle +email: vidhan.bhonsle@teradata.com +page_last_update: February 9th, 2024 +description: Getting started with ClearScape Analytics Experience +keywords: [data warehouses, compute storage separation, teradata, vantage, cloud data platform, business intelligence, enterprise analytics, jupyter, teradatasql, ipython-sql, clearscape, csae] +--- + +# Getting started with ClearScape Analytics Experience + +## Overview + +[ClearScape 
Analytics™](https://www.teradata.com/platform/clearscape-analytics) is a powerful analytics engine in [Teradata VantageCloud](https://www.teradata.com/platform/vantagecloud). It delivers breakthrough performance, value, and growth across the enterprise with the most powerful, open, and connected AI/ML capabilities on the market. You can experience ClearScape Analytics™ and Teradata Vantage, in a non-production setting, through [ClearScape Analytics Experience](https://www.teradata.com/experience).
+
+In this how-to, we will go through the steps for creating an environment in ClearScape Analytics Experience and accessing the demos.
+
+![VantageCloud](../../images/VantageCloud.png)
+
+## Create a ClearScape Analytics Experience account
+
+Head over to [ClearScape Analytics Experience](https://www.teradata.com/experience) and create a free account.
+
+![Register](../../images/csae_register.png)
+
+Sign in to your [ClearScape Analytics account](https://clearscape.teradata.com/sign-in) to create an environment and access demos.
+
+![Sign in](../../images/csae_signin.png)
+
+## Create an Environment
+
+Once signed in, click on *CREATE ENVIRONMENT*.
+
+![Create environment](../../images/csae_create_env.png)
+
+You will need to provide:
+
+| Variable | Value |
+|----------------------|-----------------------------------------------------------------------|
+| **environment name** | A name for your environment, e.g. "demo" |
+| **database password** | A password of your choice. This password will be assigned to the `dbc` and `demo_user` users |
+
+
+:::important
+Note down the database password. You will need it to connect to the database.
+:::
+
+![Environment params](../../images/csae_env_params.png)
+
+Click on the *CREATE* button to complete the creation of your environment. You can now see the details of your environment.
+
+![Environment details](../../images/csae_env_details.png)
+
+## Access demos
+
+The ClearScape Analytics Experience environment includes a variety of demos that showcase how to use analytics to solve business problems across many industries.
+
+To access the demos, click on the *RUN DEMOS USING JUPYTER* button. It will open a Jupyter environment in a new tab of your browser.
+
+:::note
+You can find all the details of the demos on the demo index page.
+:::
+
+![Usecases folder](../../images/csae_jupyter.png)
+
+
+## Summary
+
+In this quick start, we learned how to create an environment in ClearScape Analytics Experience and access the demos.
+
+## Further reading
+
+* [ClearScape Analytics Experience API documentation](https://api.clearscape.teradata.com/api-docs/)
+* [Teradata Documentation](https://docs.teradata.com/)
+
diff --git a/quickstarts/get-access-to-vantage/on-your-cloud-infrastructure/_category_.json b/quickstarts/get-access-to-vantage/on-your-cloud-infrastructure/_category_.json
new file mode 100644
index 0000000000..52ed34ffc7
--- /dev/null
+++ b/quickstarts/get-access-to-vantage/on-your-cloud-infrastructure/_category_.json
@@ -0,0 +1,4 @@
+{
+    "label": "On your cloud infrastructure",
+    "position": 2
+  }
\ No newline at end of file
diff --git a/quickstarts/get-access-to-vantage/on-your-cloud-infrastructure/run-vantage-express-on-aws.md b/quickstarts/get-access-to-vantage/on-your-cloud-infrastructure/run-vantage-express-on-aws.md
new file mode 100644
index 0000000000..28d4df14f5
--- /dev/null
+++ b/quickstarts/get-access-to-vantage/on-your-cloud-infrastructure/run-vantage-express-on-aws.md
@@ -0,0 +1,275 @@
+---
+sidebar_position: 1
+author: Adam Tworkiewicz
+email: adam.tworkiewicz@teradata.com
+page_last_update: December 12th, 2022
+description: Run Vantage Express on AWS.
+keywords: [data warehouses, compute storage separation, teradata, vantage, cloud data platform, object storage, business intelligence, enterprise analytics, AWS]
+---
+
+import UseCase from '../../_partials/use-csae.mdx';
+import CommunityLink from '../../_partials/community_link.mdx';
+import InstallVeInPublic from '../../_partials/install-ve-in-public.mdx';
+
+# Run Vantage Express on AWS
+
+
+
+## Overview
+
+This how-to demonstrates how to run Vantage Express on AWS. Vantage Express is a small footprint configuration that contains a fully functional Teradata SQL Engine.
+
+
+:::important
+Cloud charges
+
+Vantage Express is distributed as a virtual machine image. This how-to uses the EC2 `c5n.metal` instance type. It's a bare metal instance that costs over $3/h.
+
+If you want a cheaper option, try [Google Cloud](./vantage-express-gcp.md) or [Azure](./run-vantage-express-on-microsoft-azure.md), which support nested virtualization and can run Vantage Express on cheaper VMs.
+
+If you do not wish to pay for cloud usage, you can get a free hosted instance of Vantage at [ClearScape Analytics Experience](https://clearscape.teradata.com/). Alternatively, you can install Vantage Express locally using [VMware](../on-your-local/getting-started-vmware.md), [VirtualBox](../on-your-local/getting-started-vbox.md), or [UTM](../on-your-local/getting-started-utm.md).
+:::
+
+## Prerequisites
+
+* An AWS account. If you need to create a new account, follow [the official AWS instructions](https://aws.amazon.com/premiumsupport/knowledge-center/create-and-activate-aws-account/).
+* `awscli` command line utility installed and configured on your machine. You can find installation instructions here: https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html.
+
+## Installation
+
+
+* You will need a VPC with an Internet-facing subnet.
If you don't have one available, here is how you can create it: + +```bash +# Copied from https://cloudaffaire.com/how-to-create-a-custom-vpc-using-aws-cli/ + +# Create VPC +AWS_VPC_ID=$(aws ec2 create-vpc \ + --cidr-block 10.0.0.0/16 \ + --query 'Vpc.{VpcId:VpcId}' \ + --output text) + +# Enable DNS hostname for your VPC +aws ec2 modify-vpc-attribute \ + --vpc-id $AWS_VPC_ID \ + --enable-dns-hostnames "{\"Value\":true}" + +# Create a public subnet +AWS_SUBNET_PUBLIC_ID=$(aws ec2 create-subnet \ + --vpc-id $AWS_VPC_ID --cidr-block 10.0.1.0/24 \ + --query 'Subnet.{SubnetId:SubnetId}' \ + --output text) + +# Enable Auto-assign Public IP on Public Subnet +aws ec2 modify-subnet-attribute \ + --subnet-id $AWS_SUBNET_PUBLIC_ID \ + --map-public-ip-on-launch + +# Create an Internet Gateway +AWS_INTERNET_GATEWAY_ID=$(aws ec2 create-internet-gateway \ + --query 'InternetGateway.{InternetGatewayId:InternetGatewayId}' \ + --output text) + +# Attach Internet gateway to your VPC +aws ec2 attach-internet-gateway \ + --vpc-id $AWS_VPC_ID \ + --internet-gateway-id $AWS_INTERNET_GATEWAY_ID + +# Create a route table +AWS_CUSTOM_ROUTE_TABLE_ID=$(aws ec2 create-route-table \ + --vpc-id $AWS_VPC_ID \ + --query 'RouteTable.{RouteTableId:RouteTableId}' \ + --output text ) + +# Create route to Internet Gateway +aws ec2 create-route \ + --route-table-id $AWS_CUSTOM_ROUTE_TABLE_ID \ + --destination-cidr-block 0.0.0.0/0 \ + --gateway-id $AWS_INTERNET_GATEWAY_ID \ + --output text + +# Associate the public subnet with route table +AWS_ROUTE_TABLE_ASSOID=$(aws ec2 associate-route-table \ + --subnet-id $AWS_SUBNET_PUBLIC_ID \ + --route-table-id $AWS_CUSTOM_ROUTE_TABLE_ID \ + --output text | head -1) + +# Create a security group +aws ec2 create-security-group \ + --vpc-id $AWS_VPC_ID \ + --group-name myvpc-security-group \ + --description 'My VPC non default security group' \ + --output text + +# Get security group ID's +AWS_DEFAULT_SECURITY_GROUP_ID=$(aws ec2 describe-security-groups \ + --filters "Name=vpc-id,Values=$AWS_VPC_ID" \ + --query 'SecurityGroups[?GroupName == `default`].GroupId' \ + --output text) && + AWS_CUSTOM_SECURITY_GROUP_ID=$(aws ec2 describe-security-groups \ + --filters "Name=vpc-id,Values=$AWS_VPC_ID" \ + --query 'SecurityGroups[?GroupName == `myvpc-security-group`].GroupId' \ + --output text) + +# Create security group ingress rules +aws ec2 authorize-security-group-ingress \ + --group-id $AWS_CUSTOM_SECURITY_GROUP_ID \ + --ip-permissions '[{"IpProtocol": "tcp", "FromPort": 22, "ToPort": 22, "IpRanges": [{"CidrIp": "0.0.0.0/0", "Description": "Allow SSH"}]}]' \ + --output text + +# Add a tag to the VPC +aws ec2 create-tags \ + --resources $AWS_VPC_ID \ + --tags "Key=Name,Value=vantage-express-vpc" + +# Add a tag to public subnet +aws ec2 create-tags \ + --resources $AWS_SUBNET_PUBLIC_ID \ + --tags "Key=Name,Value=vantage-express-vpc-public-subnet" + +# Add a tag to the Internet-Gateway +aws ec2 create-tags \ + --resources $AWS_INTERNET_GATEWAY_ID \ + --tags "Key=Name,Value=vantage-express-vpc-internet-gateway" + +# Add a tag to the default route table +AWS_DEFAULT_ROUTE_TABLE_ID=$(aws ec2 describe-route-tables \ + --filters "Name=vpc-id,Values=$AWS_VPC_ID" \ + --query 'RouteTables[?Associations[0].Main != `false`].RouteTableId' \ + --output text) && + aws ec2 create-tags \ + --resources $AWS_DEFAULT_ROUTE_TABLE_ID \ + --tags "Key=Name,Value=vantage-express-vpc-default-route-table" + +# Add a tag to the public route table +aws ec2 create-tags \ + --resources $AWS_CUSTOM_ROUTE_TABLE_ID \ + --tags 
"Key=Name,Value=vantage-express-vpc-public-route-table" + +# Add a tags to security groups +aws ec2 create-tags \ + --resources $AWS_CUSTOM_SECURITY_GROUP_ID \ + --tags "Key=Name,Value=vantage-express-vpc-security-group" && + aws ec2 create-tags \ + --resources $AWS_DEFAULT_SECURITY_GROUP_ID \ + --tags "Key=Name,Value=vantage-express-vpc-default-security-group" + +``` + +* To create a VM you will need an ssh key pair. If you don't have it already, create one: +```bash +aws ec2 create-key-pair --key-name vantage-key --query 'KeyMaterial' --output text > vantage-key.pem +``` + +* Restrict access to the private key. Replace `` with the private key path returned by the previous command: + +```bash +chmod 600 vantage-key.pem +``` +* Get the AMI id of the latest Ubuntu image in your region: +```bash +AWS_AMI_ID=$(aws ec2 describe-images \ + --filters 'Name=name,Values=ubuntu/images/hvm-ssd/ubuntu-*amd64*' \ + --query 'Images[*].[Name,ImageId,CreationDate]' --output text \ + | sort -k3 -r | head -n1 | cut -f 2) +``` +* Create a Ubuntu VM with 4 CPU's and 8GB of RAM, and a 70GB disk. +```bash +AWS_INSTANCE_ID=$(aws ec2 run-instances \ + --image-id $AWS_AMI_ID \ + --count 1 \ + --instance-type c5n.metal \ + --block-device-mapping DeviceName=/dev/sda1,Ebs={VolumeSize=70} \ + --key-name vantage-key \ + --security-group-ids $AWS_CUSTOM_SECURITY_GROUP_ID \ + --subnet-id $AWS_SUBNET_PUBLIC_ID \ + --query 'Instances[0].InstanceId' \ + --output text) +``` +* ssh to your VM: +```bash +AWS_INSTANCE_PUBLIC_IP=$(aws ec2 describe-instances \ + --query "Reservations[*].Instances[*].PublicIpAddress" \ + --output=text --instance-ids $AWS_INSTANCE_ID) +ssh -i vantage-key.pem ubuntu@$AWS_INSTANCE_PUBLIC_IP +``` + +* Once in the VM, switch to `root` user: +```bash +sudo -i +``` + +* Prepare the download directory for Vantage Express: +```bash +mkdir /opt/downloads +cd /opt/downloads +``` + + +* If you would like to connect to Vantage Express from the Internet, you will need to open up firewall holes to your VM. 
You should also change the default password to `dbc` user: +* To change the password for `dbc` user go to your VM and start bteq: +```bash +bteq +``` + +* Login to your database using `dbc` as username and password: +```bash +.logon localhost/dbc +``` + +* Change the password for `dbc` user: +```bash +MODIFY USER dbc AS PASSWORD = new_password; +``` + +* You can now open up port 1025 to the internet: +```bash +aws ec2 authorize-security-group-ingress \ + --group-id $AWS_CUSTOM_SECURITY_GROUP_ID \ + --ip-permissions '[{"IpProtocol": "tcp", "FromPort": 1025, "ToPort": 1025, "IpRanges": [{"CidrIp": "0.0.0.0/0", "Description": "Allow Teradata port"}]}]' +``` + +## Cleanup +To stop incurring charges, delete all the resources: +```bash +# Delete the VM +aws ec2 terminate-instances --instance-ids $AWS_INSTANCE_ID --output text + +# Wait for the VM to terminate + +# Delete custom security group +aws ec2 delete-security-group \ + --group-id $AWS_CUSTOM_SECURITY_GROUP_ID + +# Delete internet gateway +aws ec2 detach-internet-gateway \ + --internet-gateway-id $AWS_INTERNET_GATEWAY_ID \ + --vpc-id $AWS_VPC_ID && + aws ec2 delete-internet-gateway \ + --internet-gateway-id $AWS_INTERNET_GATEWAY_ID + +# Delete the custom route table +aws ec2 disassociate-route-table \ + --association-id $AWS_ROUTE_TABLE_ASSOID && + aws ec2 delete-route-table \ + --route-table-id $AWS_CUSTOM_ROUTE_TABLE_ID + +# Delete the public subnet +aws ec2 delete-subnet \ + --subnet-id $AWS_SUBNET_PUBLIC_ID + +# Delete the vpc +aws ec2 delete-vpc \ + --vpc-id $AWS_VPC_ID +``` + +## Next steps +* [Query data stored in object storage](../../manage-data/nos.md) + +## Further reading +* [Teradata® Studio™ and Studio™ Express Installation Guide](https://docs.teradata.com/r/Teradata-StudioTM-and-StudioTM-Express-Installation-Guide-17.20) +* [Introduction to BTEQ](https://docs.teradata.com/r/jmAxXLdiDu6NiyjT6hhk7g/root) + + + + diff --git a/quickstarts/get-access-to-vantage/on-your-cloud-infrastructure/run-vantage-express-on-microsoft-azure.md b/quickstarts/get-access-to-vantage/on-your-cloud-infrastructure/run-vantage-express-on-microsoft-azure.md new file mode 100644 index 0000000000..d7786f0e9d --- /dev/null +++ b/quickstarts/get-access-to-vantage/on-your-cloud-infrastructure/run-vantage-express-on-microsoft-azure.md @@ -0,0 +1,116 @@ +--- +sidebar_position: 3 +author: Adam Tworkiewicz +email: adam.tworkiewicz@teradata.com +page_last_update: August 23rd, 2022 +description: Run Vantage Express on Microsoft Azure. +keywords: [data warehouses, compute storage separation, teradata, vantage, cloud data platform, object storage, business intelligence, enterprise analytics] +--- + +import UseCase from '../../_partials/use-csae.mdx'; +import Tabs from '../../_partials/tabsAzure.mdx'; +import InstallVeInPublic from '../../_partials/install-ve-in-public.mdx'; +import CommunityLink from '../../_partials/community_link.mdx'; + +# Run Vantage Express on Azure + + + +## Overview + +This how-to demonstrates how to run Vantage Express in Microsoft Azure. Vantage Express contains a fully functional Teradata SQL Engine. + +## Prerequisites + +* An Azure account. You can create one here: https://azure.microsoft.com/en-us/free/ +* `az` command line utility installed on your machine. You can find installation instructions here: https://docs.microsoft.com/en-us/cli/azure/install-azure-cli. 
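+
+Before moving on, you can optionally confirm that the `az` CLI works and is signed in to the subscription you intend to use. This is only a quick sanity check; the subscription name below is a placeholder.
+
+```bash
+# Show the installed Azure CLI version
+az --version
+
+# Sign in interactively (opens a browser window)
+az login
+
+# List your subscriptions and select the one to use ("My Subscription" is a placeholder)
+az account list --output table
+az account set --subscription "My Subscription"
+```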
+
+## Installation
+
+* Set up the default region to the closest region to you (to list locations run `az account list-locations -o table`):
+
+```bash
+az config set defaults.location=<azure-region>
+```
+
+* Create a new resource group called `tdve-resource-group` and add it to defaults:
+```bash
+az group create -n tdve-resource-group
+az config set defaults.group=tdve-resource-group
+```
+
+* To create a VM you will need an ssh key pair. If you don't have it already, create one:
+```bash
+az sshkey create --name vantage-ssh-key
+```
+
+* Restrict access to the private key. Replace `<private-key-path>` with the private key path returned by the previous command:
+```bash
+chmod 600 <private-key-path>
+```
+
+* Create an Ubuntu VM with 4 CPUs, 8GB of RAM, a 30GB OS disk, and a 60GB data disk.
+
+
+
+* ssh to your VM. Replace `<private-key-path>` and `<vm-ip>` with values that match your environment:
+```bash
+ssh -i <private-key-path> azureuser@<vm-ip>
+```
+
+* Once in the VM, switch to the `root` user:
+```bash
+sudo -i
+```
+
+* Prepare the download directory for Vantage Express:
+```bash
+mkdir /opt/downloads
+cd /opt/downloads
+```
+
+* Mount the data disk:
+```bash
+parted /dev/sdc --script mklabel gpt mkpart xfspart xfs 0% 100%
+mkfs.xfs /dev/sdc1
+partprobe /dev/sdc1
+export DISK_UUID=$(blkid | grep sdc1 | cut -d"\"" -f2)
+echo "UUID=$DISK_UUID /opt/downloads xfs defaults,nofail 1 2" >> /etc/fstab
+```
+
+
+
+* If you would like to connect to Vantage Express from the Internet, you will need to open up firewall holes to your VM. You should also change the default password for the `dbc` user:
+* To change the password for the `dbc` user, go to your VM and start bteq:
+```bash
+bteq
+```
+
+* Log in to your database using `dbc` as both the username and password:
+```bash
+.logon localhost/dbc
+```
+
+* Change the password for the `dbc` user:
+```bash
+MODIFY USER dbc AS PASSWORD = new_password;
+```
+
+* You can now open up port 1025 to the internet using the az command:
+```bash
+az vm open-port --name teradata-vantage-express --port 1025
+```
+
+## Cleanup
+
+To stop incurring charges, delete all the resources associated with the resource group:
+```bash
+az group delete --no-wait -n tdve-resource-group
+```
+
+## Next steps
+* [Query data stored in object storage](../../manage-data/nos.md)
+
+## Further reading
+* [Teradata® Studio™ and Studio™ Express Installation Guide](https://docs.teradata.com/r/Teradata-StudioTM-and-StudioTM-Express-Installation-Guide-17.20)
+* [Introduction to BTEQ](https://docs.teradata.com/r/jmAxXLdiDu6NiyjT6hhk7g/root)
+
+
diff --git a/quickstarts/get-access-to-vantage/on-your-cloud-infrastructure/vantage-express-gcp.md b/quickstarts/get-access-to-vantage/on-your-cloud-infrastructure/vantage-express-gcp.md
new file mode 100644
index 0000000000..7daa211284
--- /dev/null
+++ b/quickstarts/get-access-to-vantage/on-your-cloud-infrastructure/vantage-express-gcp.md
@@ -0,0 +1,101 @@
+---
+sidebar_position: 2
+author: Adam Tworkiewicz
+email: adam.tworkiewicz@teradata.com
+page_last_update: August 23rd, 2022
+description: Run Vantage Express on Google Cloud.
+keywords: [data warehouses, compute storage separation, teradata, vantage, cloud data platform, object storage, business intelligence, enterprise analytics]
+---
+
+import UseCase from '../../_partials/use-csae.mdx';
+import CommunityLink from '../../_partials/community_link.mdx';
+import InstallVeInPublic from '../../_partials/install-ve-in-public.mdx';
+import Tabs from '../../_partials/tabsGCP.mdx';
+
+# Run Vantage Express on Google Cloud
+
+
+
+## Overview
+
+This how-to demonstrates how to run Vantage Express in Google Cloud Platform.
Vantage Express contains a fully functional Teradata SQL Engine.
+
+:::note
+If you do not wish to pay for cloud usage, you can install Vantage Express locally using [VMware](../on-your-local/getting-started-vmware.md), [VirtualBox](../on-your-local/getting-started-vbox.md), or [UTM](../on-your-local/getting-started-utm.md).
+:::
+
+## Prerequisites
+
+* A Google Cloud account.
+* `gcloud` command line utility installed on your machine. You can find installation instructions here: https://cloud.google.com/sdk/docs/install.
+
+## Installation
+1. Create an Ubuntu VM with 4 CPUs, 8GB of RAM, and a 70GB balanced disk. The following command creates a VM in the `us-central1` region. For best performance, replace the region with one that is the closest to you. For the list of supported regions, see the [Google Cloud regions documentation](https://cloud.google.com/compute/docs/regions-zones).
+
+
+
+
+2. ssh to your VM:
+
+```bash
+gcloud compute ssh teradata-vantage-express --zone=us-central1-a
+```
+
+3. Switch to the `root` user:
+
+```bash
+sudo -i
+```
+
+4. Prepare the download directory for Vantage Express:
+
+```bash
+mkdir /opt/downloads
+cd /opt/downloads
+```
+
+
+
+* If you would like to connect to Vantage Express from the Internet, you will need to open up firewall holes to your VM. You should also change the default password for the `dbc` user:
+* To change the password for the `dbc` user, go to your VM and start bteq:
+
+```
+bteq
+```
+
+* Log in to your database using `dbc` as both the username and password:
+```
+.logon localhost/dbc
+```
+* Change the password for the `dbc` user:
+```
+MODIFY USER dbc AS PASSWORD = new_password;
+```
+
+* You can now open up port 1025 to the internet using the gcloud command:
+```
+gcloud compute firewall-rules create vantage-express --allow=tcp:1025 --direction=IN --target-tags=ve
+```
+
+## Cleanup
+
+To stop incurring charges, delete the VM:
+```
+gcloud compute instances delete teradata-vantage-express --zone=us-central1-a
+```
+
+Also, remember to remove any firewall rules that you have added, e.g.:
+```
+gcloud compute firewall-rules delete vantage-express
+```
+
+## Next steps
+* [Query data stored in object storage](../../manage-data/nos.md)
+
+## Further reading
+* [Teradata® Studio™ and Studio™ Express Installation Guide](https://docs.teradata.com/r/Teradata-StudioTM-and-StudioTM-Express-Installation-Guide-17.20)
+* [Introduction to BTEQ](https://docs.teradata.com/r/jmAxXLdiDu6NiyjT6hhk7g/root)
+
+
diff --git a/quickstarts/get-access-to-vantage/on-your-local/_category_.json b/quickstarts/get-access-to-vantage/on-your-local/_category_.json
new file mode 100644
index 0000000000..d6724457b7
--- /dev/null
+++ b/quickstarts/get-access-to-vantage/on-your-local/_category_.json
@@ -0,0 +1,4 @@
+{
+    "label": "On your local",
+    "position": 1
+  }
\ No newline at end of file
diff --git a/quickstarts/get-access-to-vantage/on-your-local/getting-started-utm.md b/quickstarts/get-access-to-vantage/on-your-local/getting-started-utm.md
new file mode 100644
index 0000000000..8286b952e2
--- /dev/null
+++ b/quickstarts/get-access-to-vantage/on-your-local/getting-started-utm.md
@@ -0,0 +1,92 @@
+---
+sidebar_position: 3
+id: run-vantage-express-on-utm
+title: Run Vantage Express on UTM
+author: Adam Tworkiewicz
+email: adam.tworkiewicz@teradata.com
+page_last_update: January 9th, 2023
+description: Run Vantage Express on your laptop using UTM.
+keywords: [data warehouses, compute storage separation, teradata, vantage, cloud data platform, object storage, business intelligence, enterprise analytics]
+
+---
+import GettingStartedIntro from '../../_partials/getting-started-intro.mdx';
+import RunVantage from '../../_partials/run-vantage.mdx';
+import RunSimpleQueries from '../../_partials/running-sample-queries.mdx';
+import GettingStartedSummary from '../../_partials/getting-started-summary.mdx';
+import CommunityLink from '../../_partials/community_link.mdx';
+
+# Run Vantage Express on UTM
+
+
+
+## Prerequisites
+
+1. A Mac computer. Both Intel and M1/2 chips are supported.
+
+:::note
+Vantage Express runs on x86 architecture. When you run the VM on M1/2 chips, UTM has to emulate x86. This is significantly slower than virtualization. If you determine that Vantage Express on M1/M2 is too slow for your needs, consider running Vantage Express in the cloud: [AWS](../on-your-cloud-infrastructure/run-vantage-express-on-aws.md), [Azure](../on-your-cloud-infrastructure/run-vantage-express-on-microsoft-azure.md), or [Google Cloud](../on-your-cloud-infrastructure/vantage-express-gcp.md).
+:::
+
+2. 30GB of disk space and enough CPU and RAM to be able to dedicate at least one core and 4GB RAM to the virtual machine.
+3. Admin rights to be able to install and run the software.
+
+:::note
+No admin rights on your local machine? Have a look at how to run Vantage Express in [AWS](../on-your-cloud-infrastructure/run-vantage-express-on-aws.md), [Azure](../on-your-cloud-infrastructure/run-vantage-express-on-microsoft-azure.md), or [Google Cloud](../on-your-cloud-infrastructure/vantage-express-gcp.md).
+:::
+
+## Installation
+
+### Download required software
+
+1. The latest version of [Vantage Express](https://downloads.teradata.com/download/database/teradata-express-for-vmware-player). If you have not used the Teradata downloads website before, you will need to register.
+2. The latest version of [UTM](https://mac.getutm.app).
+
+### Run UTM installer
+
+1. Install UTM by running the installer and accepting the default values.
+
+### Run Vantage Express
+
+1. Go to the directory where you downloaded Vantage Express and unzip the downloaded file.
+2. Start UTM, click on the `+` sign and select `Virtualize` (for Intel Macs) or `Emulate` (for M1 Macs).
+3. On the `Operating System` screen, select `Other`.
+4. On the `Other` screen, select `Skip ISO Boot`.
+5. On the `Hardware` screen, allocate at least 4GB of memory and at least 1 CPU core. We recommend 10GB RAM and 2 CPUs.
+
+![UTM Hardware](../../images/utm.hardware.png)
+
+6. On the `Storage` screen, accept the defaults by clicking `Next`.
+7. On the `Shared Directory` screen, click `Next`.
+8. On the `Summary` screen, check `Open VM Settings` and click `Save`.
+9. Go through the setup wizard. You only need to adjust the following tabs:
+    - *QEMU* - disable the `UEFI Boot` option
+    - *Network* - expose ssh (22) and Vantage (1025) ports on the host computer:
+
+![UTM Network](../../images/utm.network.png)
+
+10. Map drives:
+* Delete the default `IDE Drive`.
+* Map the 3 Vantage Express drives by importing the disk files from the downloaded VM zip file. Make sure you map them in the right order: `-disk1`, `-disk2`, `-disk3`. The first disk is bootable and contains the database itself. Disks 2 and 3 are so-called `pdisks` and contain data. As you import the files, UTM will automatically convert them from `vmdk` to `qcow2` format.
Make sure that each disk is configured using the `IDE` interface:
+
+![UTM Drives](../../images/utm.drives.png)
+
+* Once you are done mapping all 3 drives, your configuration should look like this:
+
+![UTM Drives Final](../../images/utm.final.png)
+
+11. Save the configuration and start the VM.
+
+
+
+
+
+
+
+## Next steps
+* [Query data stored in object storage](../../manage-data/nos.md)
+
+## Further reading
+* [Teradata® Studio™ and Studio™ Express Installation Guide](https://docs.teradata.com/r/Teradata-StudioTM-and-StudioTM-Express-Installation-Guide-17.20)
+
+
diff --git a/quickstarts/get-access-to-vantage/on-your-local/getting-started-vbox.md b/quickstarts/get-access-to-vantage/on-your-local/getting-started-vbox.md
new file mode 100644
index 0000000000..d1bdf066fb
--- /dev/null
+++ b/quickstarts/get-access-to-vantage/on-your-local/getting-started-vbox.md
@@ -0,0 +1,94 @@
+---
+sidebar_position: 2
+author: Adam Tworkiewicz
+email: adam.tworkiewicz@teradata.com
+page_last_update: January 9th, 2023
+description: Run Vantage Express on your laptop using VirtualBox.
+keywords: [data warehouses, compute storage separation, teradata, vantage, cloud data platform, object storage, business intelligence, enterprise analytics]
+---
+import GettingStartedIntro from '../../_partials/getting-started-intro.mdx';
+import RunVantage from '../../_partials/run-vantage.mdx';
+import RunSimpleQueries from '../../_partials/running-sample-queries.mdx';
+import GettingStartedSummary from '../../_partials/getting-started-summary.mdx';
+import CommunityLink from '../../_partials/community_link.mdx';
+
+# Run Vantage Express on VirtualBox
+
+
+
+## Prerequisites
+
+1. A computer using one of the following operating systems: Windows 10, Linux, or Intel-based MacOS.
+
+:::note
+For M1/M2 MacOS systems, see [Run Vantage Express on UTM](./getting-started-utm.md).
+:::
+
+2. 30GB of disk space and enough CPU and RAM to be able to dedicate at least one core and 6GB RAM to the virtual machine.
+3. Admin rights to be able to install and run the software.
+
+
+## Installation
+
+### Download required software
+
+1. The latest version of [Vantage Express VirtualBox Open Virtual Appliance (OVA)](https://downloads.teradata.com/download/database/teradata-express-for-vmware-player).
+
+:::note
+If you have not used the Teradata Downloads website before, you will need to register first.
+:::
+
+2. [VirtualBox](https://www.virtualbox.org/wiki/Downloads), version 6.1.
+
+:::note
+You can also install VirtualBox using `brew` and other package managers.
+:::
+
+### Run installers
+
+1. Install VirtualBox by running the installer and accepting the default values.
+
+:::note
+VirtualBox includes functionality that requires elevated privileges. When you start VirtualBox for the first time, you will be asked to confirm this elevated access. You may also need to reboot your machine to activate the VirtualBox kernel plugin.
+:::
+
+### Run Vantage Express
+
+1. Start VirtualBox.
+2. Go to the `File -> Import Appliance...` menu.
+3. In the `File` field, select the downloaded OVA file.
+4. On the next screen, accept the defaults and click on `Import`.
+5. Back in the main VirtualBox panel, start the Vantage Express appliance by double-clicking on the `Vantage 17.20` VM.
+
+![Start VM](../../images/getting-started-vbox/start-vm.png)
+
+
+
+
+
+## Updating VirtualBox Guest Extensions
+
+VirtualBox Guest Extensions is a piece of software that runs in a VM. It makes the VM run faster on VirtualBox.
It also improves the resolution of the VM screen and its responsiveness to resizing, and it implements a two-way clipboard and drag-and-drop between the host and the guest. The VirtualBox Guest Extensions version in the VM needs to match the version of your VirtualBox installation. You will likely have to update VirtualBox Guest Extensions for optimal performance.
+
+To update VirtualBox Guest Extensions:
+
+1. Insert the VirtualBox Guest Extensions DVD by clicking on `SATA Port 3: [Optical Drive]` in the `Storage` section:
+
+![Insert Guest Additions DVD](../../images/insert-guest-additions-dvd.png)
+
+2. Back in the VM window, start the `Gnome Terminal` application.
+3. Run the following command in the terminal:
+
+```
+mount /dev/cdrom /media/dvd; /media/dvd/VBoxLinuxAdditions.run
+```
+
+
+
+## Next steps
+* [Query data stored in object storage](../../manage-data/nos.md)
+
+## Further reading
+* [Teradata® Studio™ and Studio™ Express Installation Guide](https://docs.teradata.com/r/Teradata-StudioTM-and-StudioTM-Express-Installation-Guide-17.20)
+
+
diff --git a/quickstarts/get-access-to-vantage/on-your-local/getting-started-vmware.md b/quickstarts/get-access-to-vantage/on-your-local/getting-started-vmware.md
new file mode 100644
index 0000000000..f427bab4ae
--- /dev/null
+++ b/quickstarts/get-access-to-vantage/on-your-local/getting-started-vmware.md
@@ -0,0 +1,67 @@
+---
+sidebar_position: 1
+author: Adam Tworkiewicz
+email: adam.tworkiewicz@teradata.com
+page_last_update: January 9th, 2023
+description: Run Vantage Express on your laptop using VMware.
+keywords: [data warehouses, compute storage separation, teradata, vantage, cloud data platform, object storage, business intelligence, enterprise analytics]
+---
+import GettingStartedIntro from '../../_partials/getting-started-intro.mdx';
+import RunVantage from '../../_partials/run-vantage.mdx';
+import RunSimpleQueries from '../../_partials/running-sample-queries.mdx';
+import GettingStartedSummary from '../../_partials/getting-started-summary.mdx';
+import CommunityLink from '../../_partials/community_link.mdx';
+
+# Run Vantage Express on VMware
+
+
+
+## Prerequisites
+
+1. A computer using one of the following operating systems: Windows, Linux, or Intel-based MacOS.
+:::note
+For M1/M2 MacOS systems, see [Run Vantage Express on UTM](./getting-started-utm.md).
+:::
+2. 30GB of disk space and enough CPU and RAM to be able to dedicate at least one core and 6GB RAM to the virtual machine.
+3. Admin rights to be able to install and run the software.
+
+## Installation
+
+### Download required software
+
+* The latest version of [Vantage Express](https://downloads.teradata.com/download/database/teradata-express-for-vmware-player). If you have not used the Teradata downloads website before, you will need to register.
+* [VMware Workstation Player](https://www.vmware.com/products/workstation-player.html).
+
+:::important
+Commercial organizations require commercial licenses to use VMware Workstation Player. If you don't want to acquire VMware licenses, you can run Vantage Express on [VirtualBox](./getting-started-vbox.md).
+:::
+
+:::important
+VMware doesn't offer VMware Workstation Player for MacOS. If you are on a Mac, you will need to install [VMware Fusion](https://www.vmware.com/products/fusion/fusion-evaluation.html) instead. It's a paid product but VMware offers a free 30-day trial. Alternatively, you can run Vantage Express on [VirtualBox](./getting-started-vbox.md) or [UTM](./getting-started-utm.md).
+::: +* On Windows, you will also need [7zip](https://www.7-zip.org/download.html) to unzip Vantage Express. + +### Run installers + +1. Install VMware Player or VMware Fusion by running the installer and accepting the default values. +2. If on Windows, install `7zip`. + +### Run Vantage Express + +1. Go to the directory where you downloaded Vantage Express and unzip the downloaded file. +2. Double-click on the `.vmx` file. This will start the VM image in VMware Player/Fusion. +3. Press ENTER to select the highlighted `LINUX` boot partition. + + + + + + + +## Next steps +* [Query data stored in object storage](../../manage-data/nos.md) + +## Further reading +* [Teradata® Studio™ and Studio™ Express Installation Guide](https://docs.teradata.com/r/Teradata-StudioTM-and-StudioTM-Express-Installation-Guide-17.20) + + \ No newline at end of file diff --git a/quickstarts/images/VantageCloud.png b/quickstarts/images/VantageCloud.png new file mode 100644 index 0000000000..064202e68a Binary files /dev/null and b/quickstarts/images/VantageCloud.png differ diff --git a/quickstarts/images/advanceddbt1.svg b/quickstarts/images/advanceddbt1.svg new file mode 100644 index 0000000000..7b790a22c8 --- /dev/null +++ b/quickstarts/images/advanceddbt1.svg @@ -0,0 +1,121 @@ + + + + + + + + + +customers + + +customers + + +id   + [int] + + +name   + [varchar] + + +surname   + [varchar] + + +email   + [varchar] + + + +orders + + +orders + + +id   + [int] + + +customer_id   + [int] + + +order_date   + [varchar] + + +status   + [varchar] + + + +customers--orders + +0..N +1 + + + +order_products + + +order_products + + +order_id    + [int] + + +product_id   + [int] + + +quantity   + [int] + + + +orders--order_products + +0..N +1 + + + +products + + +products + + +id   + [int] + + +name   + [varchar] + + +category   + [varchar] + + +unit_price   + [varchar] + + + +products--order_products + +0..N +1 + + + diff --git a/quickstarts/images/advanceddbt2.svg b/quickstarts/images/advanceddbt2.svg new file mode 100644 index 0000000000..2b444d4bcc --- /dev/null +++ b/quickstarts/images/advanceddbt2.svg @@ -0,0 +1,133 @@ + + + + + + + + + +dim_customers + + +dim_customers + + +customer_id   + [int] + + +first_name   + [varchar] + + +last_name   + [varchar] + + +email   + [varchar] + + + +fct_order_details + + +fct_order_details + + +order_id   + [int] + + +product_id   + [int] + + +customer_id   + [int] + + +order_date   + [varchar] + + +unit_price   + [varchar] + + +quantity   + [int] + + +amount   + [varchar] + + + +dim_customers--fct_order_details + +0..N +1 + + + +dim_orders + + +dim_orders + + +order_id   + [int] + + +order_date   + [varchar] + + +order_status   + [varchar] + + + +dim_orders--fct_order_details + +0..N +1 + + + +dim_products + + +dim_products + + +product_id   + [int] + + +product_name   + [varchar] + + +product_category   + [varchar] + + +price_dollars   + [varchar] + + + +dim_products--fct_order_details + +0..N +1 + + + diff --git a/quickstarts/images/anypoint.import.projects.png b/quickstarts/images/anypoint.import.projects.png new file mode 100644 index 0000000000..ba9e77a143 Binary files /dev/null and b/quickstarts/images/anypoint.import.projects.png differ diff --git a/quickstarts/images/banking.model.png b/quickstarts/images/banking.model.png new file mode 100644 index 0000000000..a3cc6b501a Binary files /dev/null and b/quickstarts/images/banking.model.png differ diff --git a/quickstarts/images/browser.copy.curl.png b/quickstarts/images/browser.copy.curl.png new file mode 100644 index 
0000000000..71c085f199 Binary files /dev/null and b/quickstarts/images/browser.copy.curl.png differ diff --git a/quickstarts/images/browser.network.png b/quickstarts/images/browser.network.png new file mode 100644 index 0000000000..06cb7985af Binary files /dev/null and b/quickstarts/images/browser.network.png differ diff --git a/quickstarts/images/csae_create_env.png b/quickstarts/images/csae_create_env.png new file mode 100644 index 0000000000..c1ffb091f0 Binary files /dev/null and b/quickstarts/images/csae_create_env.png differ diff --git a/quickstarts/images/csae_env_details.png b/quickstarts/images/csae_env_details.png new file mode 100644 index 0000000000..7de15e9bd3 Binary files /dev/null and b/quickstarts/images/csae_env_details.png differ diff --git a/quickstarts/images/csae_env_params.png b/quickstarts/images/csae_env_params.png new file mode 100644 index 0000000000..6fc8b538d5 Binary files /dev/null and b/quickstarts/images/csae_env_params.png differ diff --git a/quickstarts/images/csae_jupyter.png b/quickstarts/images/csae_jupyter.png new file mode 100644 index 0000000000..e392a16e3e Binary files /dev/null and b/quickstarts/images/csae_jupyter.png differ diff --git a/quickstarts/images/csae_register.png b/quickstarts/images/csae_register.png new file mode 100644 index 0000000000..ea3c5a0a3b Binary files /dev/null and b/quickstarts/images/csae_register.png differ diff --git a/quickstarts/images/csae_signin.png b/quickstarts/images/csae_signin.png new file mode 100644 index 0000000000..cd8b1cc87f Binary files /dev/null and b/quickstarts/images/csae_signin.png differ diff --git a/quickstarts/images/dbt1.svg b/quickstarts/images/dbt1.svg new file mode 100644 index 0000000000..5404075f63 --- /dev/null +++ b/quickstarts/images/dbt1.svg @@ -0,0 +1,95 @@ + + + + + + + + + +customers + + +customers + + +id   + [int] + + +first_name   + [varchar] + + +last_name   + [varchar] + + +email   + [varchar] + + + +orders + + +orders + + +id   + [int] + + +user_id   + [int] + + +order_date   + [date] + + +status   + [varchar] + + + +customers--orders + +0..N +1 + + + +payments + + +payments + + +id   + [int] + + +order_id   + [int] + + +payment_method   + [int] + + +amount   + [int] + + + +orders--payments + +0..N +1 + + + diff --git a/quickstarts/images/dbt2.svg b/quickstarts/images/dbt2.svg new file mode 100644 index 0000000000..d17680358c --- /dev/null +++ b/quickstarts/images/dbt2.svg @@ -0,0 +1,101 @@ + + + + + + + + + +dimension: customers + + +dimension: customers + + +customer_id   + [int] + + +first_name   + [varchar] + + +last_name   + [varchar] + + +email   + [varchar] + + +first_order   + [date] + + +most_recent_order   + [date] + + +number_of_orders   + [int] + + +total_order_amount   + [int] + + + +fact: orders + + +fact: orders + + +order_id   + [int] + + +customer_id   + [int] + + +order_date   + [date] + + +status   + [varchar] + + +amount   + [int] + + +credit_card_amount   + [int] + + +coupon_amount   + [int] + + +bank_transfer_amount   + [int] + + +gift_card_amount   + [int] + + + +dimension: customers--fact: orders + +0..N +1 + + + diff --git a/quickstarts/images/dbt3.svg b/quickstarts/images/dbt3.svg new file mode 100644 index 0000000000..558a07b7c2 --- /dev/null +++ b/quickstarts/images/dbt3.svg @@ -0,0 +1,179 @@ + + + + + + + + + +raw_customers + + +raw_customers + + +cust_id   + [INTEGER] + + +income   + [DECIMAL(15, 1)] + + +age   + [INTEGER] + + +years_with_bank   + [INTEGER] + + +nbr_children   + [INTEGER] + + +gender   + [VARCHAR(1)] + + +marital_status   + 
[VARCHAR(1)] + + +name_prefix   + [VARCHAR(4)] + + +first_name   + [VARCHAR(12)] + + +last_name   + [VARCHAR(15)] + + +street_nbr   + [VARCHAR(8)] + + +street_name   + [VARCHAR(15)] + + +postal_code   + [VARCHAR(5)] + + +city_name   + [VARCHAR(16)] + + +state_code   + [VARCHAR(2)] + + + +raw_accounts + + +raw_accounts + + +acct_nbr   + [VARCHAR(18)] + + +cust_id   + [INTEGER] + + +acct_type   + [VARCHAR(2)] + + +account_active   + [VARCHAR(1)] + + +acct_start_date   + [DATE] + + +acct_end_date   + [DATE] + + +starting_balance   + [DECIMAL(11, 3)] + + +ending_balance   + [DECIMAL(11, 3)] + + + +raw_customers--raw_accounts + +0..N +1 + + + +raw_transactions + + +raw_transactions + + +tran_id   + [INTEGER] + + +acct_nbr   + [VARCHAR(18)] + + +tran_amt   + [DECIMAL(9, 2)] + + +principal_amt   + [DECIMAL(15, 2)] + + +interest_amt   + [DECIMAL(11, 3)] + + +new_balance   + [DECIMAL(9, 2)] + + +tran_date   + [DATE] + + +tran_time   + [INTEGER] + + +channel   + [VARCHAR(1)] + + +tran_code   + [VARCHAR(2)] + + + +raw_accounts--raw_transactions + +0..N +1 + + + diff --git a/quickstarts/images/dbt4.png b/quickstarts/images/dbt4.png new file mode 100644 index 0000000000..dd7d8a8d60 Binary files /dev/null and b/quickstarts/images/dbt4.png differ diff --git a/quickstarts/images/dita.svg b/quickstarts/images/dita.svg new file mode 100644 index 0000000000..8bdd610b7c --- /dev/null +++ b/quickstarts/images/dita.svg @@ -0,0 +1 @@ +JSON TransformationRaw JSON DataNormalized ViewsDimensional ModelingDimensionandFact Tables \ No newline at end of file diff --git a/quickstarts/images/flow.png b/quickstarts/images/flow.png new file mode 100644 index 0000000000..4f5a69cb61 Binary files /dev/null and b/quickstarts/images/flow.png differ diff --git a/quickstarts/images/getting-started-vbox/start-vm.png b/quickstarts/images/getting-started-vbox/start-vm.png new file mode 100644 index 0000000000..eccba90045 Binary files /dev/null and b/quickstarts/images/getting-started-vbox/start-vm.png differ diff --git a/quickstarts/images/gettingstarteddemo.ipynb.png b/quickstarts/images/gettingstarteddemo.ipynb.png new file mode 100644 index 0000000000..d81bb0a7fa Binary files /dev/null and b/quickstarts/images/gettingstarteddemo.ipynb.png differ diff --git a/quickstarts/images/insert-guest-additions-dvd.png b/quickstarts/images/insert-guest-additions-dvd.png new file mode 100644 index 0000000000..6426d36eac Binary files /dev/null and b/quickstarts/images/insert-guest-additions-dvd.png differ diff --git a/quickstarts/images/joined_table_ml.png b/quickstarts/images/joined_table_ml.png new file mode 100644 index 0000000000..5d6415fd0c Binary files /dev/null and b/quickstarts/images/joined_table_ml.png differ diff --git a/quickstarts/images/lake_advanced_option.png b/quickstarts/images/lake_advanced_option.png new file mode 100644 index 0000000000..fa817c9a45 Binary files /dev/null and b/quickstarts/images/lake_advanced_option.png differ diff --git a/quickstarts/images/lake_advanced_option_default.png b/quickstarts/images/lake_advanced_option_default.png new file mode 100644 index 0000000000..bae28cfd60 Binary files /dev/null and b/quickstarts/images/lake_advanced_option_default.png differ diff --git a/quickstarts/images/lake_available_environment.png b/quickstarts/images/lake_available_environment.png new file mode 100644 index 0000000000..b3df7d4945 Binary files /dev/null and b/quickstarts/images/lake_available_environment.png differ diff --git a/quickstarts/images/lake_create_environment.png 
b/quickstarts/images/lake_create_environment.png new file mode 100644 index 0000000000..7eb180cbd1 Binary files /dev/null and b/quickstarts/images/lake_create_environment.png differ diff --git a/quickstarts/images/lake_database_cred.png b/quickstarts/images/lake_database_cred.png new file mode 100644 index 0000000000..20c3da6ba2 Binary files /dev/null and b/quickstarts/images/lake_database_cred.png differ diff --git a/quickstarts/images/lake_environment_configuration.png b/quickstarts/images/lake_environment_configuration.png new file mode 100644 index 0000000000..a2e1a95d2a Binary files /dev/null and b/quickstarts/images/lake_environment_configuration.png differ diff --git a/quickstarts/images/lake_environment_page.png b/quickstarts/images/lake_environment_page.png new file mode 100644 index 0000000000..5e538b52b7 Binary files /dev/null and b/quickstarts/images/lake_environment_page.png differ diff --git a/quickstarts/images/lake_expanded_menu.png b/quickstarts/images/lake_expanded_menu.png new file mode 100644 index 0000000000..d1c3ad8f29 Binary files /dev/null and b/quickstarts/images/lake_expanded_menu.png differ diff --git a/quickstarts/images/lake_ip_addresses.png b/quickstarts/images/lake_ip_addresses.png new file mode 100644 index 0000000000..a018bc0f9f Binary files /dev/null and b/quickstarts/images/lake_ip_addresses.png differ diff --git a/quickstarts/images/lake_primary_cluster_config.png b/quickstarts/images/lake_primary_cluster_config.png new file mode 100644 index 0000000000..aea344b9fd Binary files /dev/null and b/quickstarts/images/lake_primary_cluster_config.png differ diff --git a/quickstarts/images/lake_public_internet_cv.png b/quickstarts/images/lake_public_internet_cv.png new file mode 100644 index 0000000000..03078aaa3c Binary files /dev/null and b/quickstarts/images/lake_public_internet_cv.png differ diff --git a/quickstarts/images/lake_settings_menu.png b/quickstarts/images/lake_settings_menu.png new file mode 100644 index 0000000000..52cf40777d Binary files /dev/null and b/quickstarts/images/lake_settings_menu.png differ diff --git a/quickstarts/images/lake_sign_on.png b/quickstarts/images/lake_sign_on.png new file mode 100644 index 0000000000..8ce86f265d Binary files /dev/null and b/quickstarts/images/lake_sign_on.png differ diff --git a/quickstarts/images/lake_welcome_page.png b/quickstarts/images/lake_welcome_page.png new file mode 100644 index 0000000000..c3f4699551 Binary files /dev/null and b/quickstarts/images/lake_welcome_page.png differ diff --git a/quickstarts/images/ml_gender_hot_encoded.png b/quickstarts/images/ml_gender_hot_encoded.png new file mode 100644 index 0000000000..b0021c18f8 Binary files /dev/null and b/quickstarts/images/ml_gender_hot_encoded.png differ diff --git a/quickstarts/images/ml_model_evaluated.png b/quickstarts/images/ml_model_evaluated.png new file mode 100644 index 0000000000..32e1db8eb5 Binary files /dev/null and b/quickstarts/images/ml_model_evaluated.png differ diff --git a/quickstarts/images/ml_model_scored.png b/quickstarts/images/ml_model_scored.png new file mode 100644 index 0000000000..878a56a615 Binary files /dev/null and b/quickstarts/images/ml_model_scored.png differ diff --git a/quickstarts/images/ml_model_trained.png b/quickstarts/images/ml_model_trained.png new file mode 100644 index 0000000000..3b5aedf9a3 Binary files /dev/null and b/quickstarts/images/ml_model_trained.png differ diff --git a/quickstarts/images/ml_tot_income_scaled.png b/quickstarts/images/ml_tot_income_scaled.png new file mode 100644 index 
0000000000..569e13b26b Binary files /dev/null and b/quickstarts/images/ml_tot_income_scaled.png differ diff --git a/quickstarts/images/ml_train_col.png b/quickstarts/images/ml_train_col.png new file mode 100644 index 0000000000..7cfe695288 Binary files /dev/null and b/quickstarts/images/ml_train_col.png differ diff --git a/quickstarts/images/run-vantage/boot-manager-menu.png b/quickstarts/images/run-vantage/boot-manager-menu.png new file mode 100644 index 0000000000..f564cdcb0d Binary files /dev/null and b/quickstarts/images/run-vantage/boot-manager-menu.png differ diff --git a/quickstarts/images/run-vantage/grub-menu.png b/quickstarts/images/run-vantage/grub-menu.png new file mode 100644 index 0000000000..b2ca241a05 Binary files /dev/null and b/quickstarts/images/run-vantage/grub-menu.png differ diff --git a/quickstarts/images/run-vantage/new.connection.png b/quickstarts/images/run-vantage/new.connection.png new file mode 100644 index 0000000000..1c16cf5044 Binary files /dev/null and b/quickstarts/images/run-vantage/new.connection.png differ diff --git a/quickstarts/images/run-vantage/new.connection.profile.png b/quickstarts/images/run-vantage/new.connection.profile.png new file mode 100644 index 0000000000..45f6552810 Binary files /dev/null and b/quickstarts/images/run-vantage/new.connection.profile.png differ diff --git a/quickstarts/images/run-vantage/okay-the-security-popup.png b/quickstarts/images/run-vantage/okay-the-security-popup.png new file mode 100644 index 0000000000..1e9fbaaaa6 Binary files /dev/null and b/quickstarts/images/run-vantage/okay-the-security-popup.png differ diff --git a/quickstarts/images/run-vantage/start-gnome-terminal.png b/quickstarts/images/run-vantage/start-gnome-terminal.png new file mode 100644 index 0000000000..68dad8eba4 Binary files /dev/null and b/quickstarts/images/run-vantage/start-gnome-terminal.png differ diff --git a/quickstarts/images/run-vantage/start-teradata-studio-express.png b/quickstarts/images/run-vantage/start-teradata-studio-express.png new file mode 100644 index 0000000000..7fc230ecb7 Binary files /dev/null and b/quickstarts/images/run-vantage/start-teradata-studio-express.png differ diff --git a/quickstarts/images/run-vantage/vm.login.png b/quickstarts/images/run-vantage/vm.login.png new file mode 100644 index 0000000000..8177ead596 Binary files /dev/null and b/quickstarts/images/run-vantage/vm.login.png differ diff --git a/quickstarts/images/run-vantage/wait-for-gui.png b/quickstarts/images/run-vantage/wait-for-gui.png new file mode 100644 index 0000000000..d93247832e Binary files /dev/null and b/quickstarts/images/run-vantage/wait-for-gui.png differ diff --git a/quickstarts/images/segment.flow.diagram.png b/quickstarts/images/segment.flow.diagram.png new file mode 100644 index 0000000000..bb241e11d9 Binary files /dev/null and b/quickstarts/images/segment.flow.diagram.png differ diff --git a/quickstarts/images/select.import.option.png b/quickstarts/images/select.import.option.png new file mode 100644 index 0000000000..545fd77294 Binary files /dev/null and b/quickstarts/images/select.import.option.png differ diff --git a/quickstarts/images/utm.drives.png b/quickstarts/images/utm.drives.png new file mode 100644 index 0000000000..0178b8b53d Binary files /dev/null and b/quickstarts/images/utm.drives.png differ diff --git a/quickstarts/images/utm.final.png b/quickstarts/images/utm.final.png new file mode 100644 index 0000000000..78d7511d34 Binary files /dev/null and b/quickstarts/images/utm.final.png differ diff --git 
a/quickstarts/images/utm.hardware.png b/quickstarts/images/utm.hardware.png new file mode 100644 index 0000000000..a2b8b64e47 Binary files /dev/null and b/quickstarts/images/utm.hardware.png differ diff --git a/quickstarts/images/utm.network.png b/quickstarts/images/utm.network.png new file mode 100644 index 0000000000..96607e08d0 Binary files /dev/null and b/quickstarts/images/utm.network.png differ diff --git a/quickstarts/introduction/_category_.json b/quickstarts/introduction/_category_.json new file mode 100644 index 0000000000..21ab3c73fc --- /dev/null +++ b/quickstarts/introduction/_category_.json @@ -0,0 +1,4 @@ +{ + "label": "Introduction", + "position": 1 + } \ No newline at end of file diff --git a/quickstarts/introduction/images/teradata-vantage-architecture-concepts/teradata_architecture_major_components.png b/quickstarts/introduction/images/teradata-vantage-architecture-concepts/teradata_architecture_major_components.png new file mode 100644 index 0000000000..a7f1d69b0e Binary files /dev/null and b/quickstarts/introduction/images/teradata-vantage-architecture-concepts/teradata_architecture_major_components.png differ diff --git a/quickstarts/introduction/images/teradata-vantage-architecture-concepts/teradata_data_distribution.png b/quickstarts/introduction/images/teradata-vantage-architecture-concepts/teradata_data_distribution.png new file mode 100644 index 0000000000..d7dea8873b Binary files /dev/null and b/quickstarts/introduction/images/teradata-vantage-architecture-concepts/teradata_data_distribution.png differ diff --git a/quickstarts/introduction/images/teradata-vantage-architecture-concepts/teradata_parallelism.png b/quickstarts/introduction/images/teradata-vantage-architecture-concepts/teradata_parallelism.png new file mode 100644 index 0000000000..05b2524bb9 Binary files /dev/null and b/quickstarts/introduction/images/teradata-vantage-architecture-concepts/teradata_parallelism.png differ diff --git a/quickstarts/introduction/images/teradata-vantage-architecture-concepts/teradata_retrieval_architecture.png b/quickstarts/introduction/images/teradata-vantage-architecture-concepts/teradata_retrieval_architecture.png new file mode 100644 index 0000000000..c043e6d9c4 Binary files /dev/null and b/quickstarts/introduction/images/teradata-vantage-architecture-concepts/teradata_retrieval_architecture.png differ diff --git a/quickstarts/introduction/teradata-vantage-engine-architecture-and-concepts.md b/quickstarts/introduction/teradata-vantage-engine-architecture-and-concepts.md new file mode 100644 index 0000000000..e5173e8be8 --- /dev/null +++ b/quickstarts/introduction/teradata-vantage-engine-architecture-and-concepts.md @@ -0,0 +1,114 @@ +--- +id: teradata-vantage-engine-architecture-and-concepts +title: Teradata Vantage Engine Architecture and Concepts +sidebar_label: Teradata Vantage Engine Architecture and Concepts +sidebar_position: 1 +author: Krutik Pathak +email: krutik.pathak@teradata.com +page_last_update: August 7th, 2023 +description: Teradata Vantage Architecture and Concepts. +keywords: [data warehouses, teradata vantage engine architecture, teradata, vantage, Access Module Processors (AMP), Parsing Engines (PE), Massively Parallel Processing (MPP), Virtual Disk (Vdisks), BYNET] +--- + +# Teradata Vantage Engine Architecture and Concepts + +### Overview + +This article explains the underlying concepts of Teradata Vantage engine architecture. All editions of Vantage, including the Primary Cluster in VantageCloud Lake utilize the same engine. 
+
+Teradata's architecture is a Massively Parallel Processing (MPP), shared-nothing design that enables high-performance data processing and analytics. The MPP architecture distributes the workload across multiple virtual processors (vprocs). The virtual processor where query processing takes place is commonly referred to as an Access Module Processor (AMP). Each AMP is isolated from the other AMPs and processes queries in parallel, allowing Teradata to process large volumes of data rapidly.
+
+The major architectural components of the Teradata Vantage engine include the Parsing Engines (PEs), BYNET, Access Module Processors (AMPs), and Virtual Disks (Vdisks). Vdisks are assigned to AMPs in enterprise platforms, and to the Primary Cluster in the case of VantageCloud Lake environments.
+
+![Teradata Vantage Major Architectural Components](./images/teradata-vantage-architecture-concepts/teradata_architecture_major_components.png)
+
+## Teradata Vantage Engine Architecture Components
+The Teradata Vantage engine consists of the components below:
+
+### Parsing Engines (PE)
+When a SQL query is run in Teradata, it first reaches the Parsing Engine. The functions of the Parsing Engine are:
+
+* Manage individual user sessions (up to 120).
+* Check if the objects used in the SQL query exist.
+* Check if the user has the required privileges on the objects used in the SQL query.
+* Parse and optimize the SQL queries.
+* Prepare the execution plan to execute the SQL query and pass it to the corresponding AMPs.
+* Receive the response from the AMPs and send it back to the requesting client.
+
+
+### BYNET
+BYNET is a system that enables component communication. The BYNET system provides high-speed bi-directional broadcast, multicast, and point-to-point communication and merge functions. Its key functions include coordinating multi-AMP queries, reading data from multiple AMPs, regulating message flow to prevent congestion, and processing platform throughput. These functions of BYNET make Vantage highly scalable and enable Massively Parallel Processing (MPP) capabilities.
+
+### Parallel Database Extension (PDE)
+Parallel Database Extension (PDE) is an intermediary software layer positioned between the operating system and the Teradata Vantage database. PDE enables MPP systems to use features such as BYNET and shared disks. It facilitates the parallelism that is responsible for the speed and linear scalability of the Teradata Vantage database.
+
+### Access Module Processor (AMP)
+AMPs are responsible for data storage and retrieval. Each AMP is associated with its own set of Virtual Disks (Vdisks) where the data is stored, and, in line with the shared-nothing architecture, no other AMP can access that content. The functions of an AMP are:
+
+* Access storage using Vantage’s Block File System Software
+* Lock management
+* Sorting rows
+* Aggregating columns
+* Join processing
+* Output conversion
+* Disk space management
+* Accounting
+* Recovery processing
+
+:::note
+AMPs in VantageCore IntelliFlex, VantageCore VMware, VantageCloud Enterprise, and the Primary Cluster in the case of VantageCloud Lake store data in a Block File System (BFS) format on Vdisks. AMPs in Compute Clusters and Compute Worker Nodes on VantageCloud Lake do not have BFS; they can only access data in object storage using the Object File System (OFS).
+:::
+
+### Virtual Disks (Vdisks)
+These are units of storage space owned by an AMP. Virtual Disks are used to hold user data (rows within tables).
Virtual Disks map to physical space on a disk. + +### Node +A node, in the context of Teradata systems, represents an individual server that functions as a hardware platform for the database software. It serves as a processing unit where database operations are executed under the control of a single operating system. When Teradata is deployed in a cloud, it follows the same MPP, shared-nothing architecture but the physical nodes are replaced with virtual machines (VMs). + +## Teradata Vantage Architecture Concepts +The concepts below are applicable to Teradata Vantage. + +### Linear Growth and Expandability +Teradata is a linearly expandable RDBMS. As the workload and data volume increase, adding more hardware resources such as servers or nodes results in a proportional increase in performance and capacity. Linear Scalability allows for increased workload without decreased throughput. + +### Teradata Parallelism +Teradata parallelism refers to the inherent ability of the Teradata Database to perform parallel processing of data and queries across multiple nodes or components simultaneously.  + +* Each Parsing Engine (PE) in Teradata has the capability to handle up to 120 sessions concurrently. +* The BYNET in Teradata enables parallel handling of all message activity, including data redistribution for subsequent tasks. +* All Access Module Processors (AMPs) in Teradata can collaborate in parallel to serve any incoming request. +* Each AMP can work on multiple requests concurrently, allowing for efficient parallel processing. + +![Teradata Parallelism](./images/teradata-vantage-architecture-concepts/teradata_parallelism.png) + + +### Teradata Retrieval Architecture +The key steps involved in Teradata Retrieval Architecture are: + +1. The Parsing Engine sends a request to retrieve one or more rows. +2. The BYNET activates the relevant AMP(s) for processing. +3. The AMP(s) concurrently locate and retrieve the desired row(s) through parallel access. +4. The BYNET returns the retrieved row(s) to the Parsing Engine. +5. The Parsing Engine then delivers the row(s) back to the requesting client application. + +![Teradata Retrieval Architecture](./images/teradata-vantage-architecture-concepts/teradata_retrieval_architecture.png) + +### Teradata Data Distribution +Teradata's MPP architecture requires an efficient means of distributing and retrieving data and does so using hash partitioning. Most tables in Vantage use hashing to distribute data for the tables based on the value of the row’s Primary Index (PI) to disk storage in Block File System (BFS) and may scan the entire table or use indexes to access the data. This approach ensures scalable performance and efficient data access. + +* If the Primary Index is unique then the rows in the tables are automatically distributed evenly by hash partitioning. +* The designated Primary Index column(s) are hashed to generate consistent hash codes for the same values. +* No reorganization, repartitioning, or space management is required. +* Each AMP typically contains rows from all tables, ensuring efficient data access and processing. 
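+
+If you want to see this distribution for yourself, Teradata's hash functions report which AMP owns the hash bucket for a given primary index value. The sketch below is illustrative only: `MyDB.MyTable`, its primary index column `cust_id`, and `<your-password>` are placeholders for objects and credentials on your own system.
+
+```bash
+# Count rows per AMP for a table (run from a host that can reach the database).
+# MyDB.MyTable, cust_id, and <your-password> are placeholders.
+bteq <<'EOF'
+.logon localhost/dbc,<your-password>
+SELECT HASHAMP(HASHBUCKET(HASHROW(cust_id))) AS amp_no,
+       COUNT(*) AS row_count
+FROM MyDB.MyTable
+GROUP BY 1
+ORDER BY 1;
+.logoff
+.quit
+EOF
+```
+
+An even spread of `row_count` across `amp_no` values indicates a well-chosen primary index; heavy skew toward a few AMPs suggests the primary index has low cardinality.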
+ +![Teradata Data Distribution](./images/teradata-vantage-architecture-concepts/teradata_data_distribution.png) + +## Conclusion +In this article, we covered the major architectural components of Teradata Vantage, such as the Parsing Engines (PEs), BYNET, Access Module Processors (AMPs), Virtual Disk (Vdisk), other architectural components such as Parallel Database Extension (PDE), Node and the essential concepts of Teradata Vantage such as Linear Growth and Expandability, Parallelism, Data Retrieval, and Data Distribution. + +## Further Reading +* [Parsing Engine](https://docs.teradata.com/r/Enterprise_IntelliFlex_VMware/Database-Introduction/Vantage-Hardware-and-Software-Architecture/Virtual-Processors/Parsing-Engine) +* [BYNET](https://www.teradata.com/Blogs/What-Is-the-BYNET-and-Why-Is-It-Important-to-Vantage) +* [Access Module Processor](https://docs.teradata.com/r/Enterprise_IntelliFlex_VMware/Database-Introduction/Vantage-Hardware-and-Software-Architecture/Virtual-Processors/Access-Module-Processor) +* [Parallel Database Extensions](https://docs.teradata.com/r/Enterprise_IntelliFlex_VMware/Database-Introduction/Vantage-Hardware-and-Software-Architecture/Parallel-Database-Extensions) +* [Teradata Data Distribution and Data Access Methods](https://docs.teradata.com/r/Enterprise_IntelliFlex_VMware/Database-Introduction/Data-Distribution-and-Data-Access-Methods) \ No newline at end of file diff --git a/quickstarts/manage-data/_category_.json b/quickstarts/manage-data/_category_.json new file mode 100644 index 0000000000..c3ca27eadd --- /dev/null +++ b/quickstarts/manage-data/_category_.json @@ -0,0 +1,4 @@ +{ + "label": "Manage data", + "position": 4 + } \ No newline at end of file diff --git a/quickstarts/manage-data/advanced-dbt.md b/quickstarts/manage-data/advanced-dbt.md new file mode 100644 index 0000000000..8de2dbeb4e --- /dev/null +++ b/quickstarts/manage-data/advanced-dbt.md @@ -0,0 +1,215 @@ +--- +sidebar_position: 5 +id: advanced-dbt +title: Advanced dbt use cases with Teradata Vantage +author: Daniel Herrera +email: daniel.herrera2@teradata.com +page_last_update: May 22th, 2023 +description: Advanced dbt (data build tool) use cases with Teradata Vantage. +keywords: [data warehouses, compute storage separation, teradata, vantage, cloud data platform, object storage, business intelligence, enterprise analytics, elt, dbt.] +--- + +import ClearscapeDocsNote from '../_partials/vantage_clearscape_analytics.mdx' +import CommunityLink from '../_partials/community_link.mdx' + +# Advanced dbt use cases with Teradata Vantage + +## Overview + +This project showcases the integration of dbt with Teradata Vantage from an advanced user perspective. +If you are new to data engineering with dbt we recommend that you start with our [introductory project.](https://quickstarts.teradata.com/dbt.html) + +The advanced use cases showcased in the demo are the following: + +* Incremental materializations +* Utility macros +* Optimizing table/view creations with Teradata-specific modifiers + +The application of these concepts is illustrated through the ELT process of `teddy_retailers`, a fictional store. + +## Prerequisites + +* Access to a Teradata Vantage instance. + + + +* Python *3.7*, *3.8*, *3.9* or *3.10* installed. + +* A database client for running database commands, an example of the configuration of one such client is presented in [this tutorial.](https://quickstarts.teradata.com/other-integrations/configure-a-teradata-vantage-connection-in-dbeaver.html). + +## Demo project setup + +1. 
Clone the tutorial repository and cd into the project directory: +```bash +git clone https://github.com/Teradata/teddy_retailers_dbt-dev teddy_retailers +cd teddy_retailers +``` + +2. Create a new python environment to manage dbt and its dependencies. Confirm that the Python Version you are using to create the environment is within the supported versions listed above. +```bash +python -m venv env +``` + +3. Activate the python environment according to your operating system. +```bash +source env/bin/activate +``` + +for Mac, Linux, or +```bash +env\Scripts\activate +``` +for Windows +4. Install the `dbt-teradata` module. The core dbt module is included as a dependency so you don't have to install it separately: + +```bash +pip install dbt-teradata +``` + +5. Install the project's dependencies `dbt-utils` and `teradata-utils`. This can be done through the following command: + +```bash +dbt deps +``` + +## Data warehouse setup + +The demo project assumes that the source data is already loaded into your data warehouse, this mimics the way that dbt is used in a production environment. +To achieve this objective we provide public datasets available in Google Cload Platform (GCP), and scripts to load those datasets into your mock data warehouse. + + +1. Create or select a working database. The dbt profile in the project points to a database called `teddy_retailers`. You can change the `schema` value to point to an existing database in your Teradata Vantage instance or you can create the `teddy_retailers` database running the following script in your database client: +```sql +CREATE DATABASE teddy_retailers +AS PERMANENT = 110e6, + SPOOL = 220e6; +``` +2. Load Initial data set. +To load the initial data set into the data warehouse, the required scripts are available in the `references/inserts/create_data.sql` path of the project. +You can execute these scripts by copying and pasting them into your database client. For guidance on running these scripts in your specific case please consult your database client's documentation. + +## Configure dbt + +We will now configure dbt to connect to your Vantage database. +Create the file `$HOME/.dbt/profiles.yml` with the following content. Adjust ``, ``, `` to match your Teradata Vantage instance. +If you have already used dbt before in your environment you only need to add a profile for the project in your home's directory `.dbt/profiles.yml` file. +If the directory .dbt doesn't exist in your system yet you will need to create it and add the profiles.yml to manage your dbt profiles. + + +```bash +teddy_retailers: + outputs: + dev: + type: teradata + host: + user: + password: + logmech: TD2 + schema: teddy_retailers + tmode: ANSI + threads: 1 + timeout_seconds: 300 + priority: interactive + retries: 1 + target: dev +``` + +Now, that we have the profile file in place, we can validate the setup: + +```bash +dbt debug +``` + +If the debug command returned errors, you likely have an issue with the content of `profiles.yml`. + +## About the Teddy Retailers warehouse + +As mentioned, `teddy_retailers` is a fictional store. +Through dbt driven transformations we transform source data ingested from the`teddy_retailers` transactional database into a star schema ready for analytics. 
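
To make that concrete, here is a minimal sketch of what a dbt model in such a project could look like; the model name, columns, and the `stg_customers` staging model it references are illustrative assumptions, not files taken from the `teddy_retailers` repository.

```sql
-- models/marts/dim_customers.sql (hypothetical example, not part of the repository)
{{
    config(
        materialized='table'
    )
}}

select
    customer_id,
    first_name,
    last_name,
    email
from {{ ref('stg_customers') }}  -- builds on an assumed staging model
```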
+ +### The data models + +The source data consists of the following tables customers, orders, products, and order_products, according to the following Entity Relations Diagram: + +![advanceddbt1](../images/advanceddbt1.svg) + +Using dbt, we leverage the source data tables to construct the following dimensional model, which is optimized for analytics tools. + +![advanceddbt2](../images/advanceddbt2.svg) + +### The sources + +* For Teddy Retailers, the `orders` and `order_products` sources are periodically updated by the organization's ELT (Extract, Load, Transform) process. +* The updated data only includes the latest changes rather than the entire dataset due to its large volume. +* To address this challenge, it is necessary to capture these incremental updates while preserving the previously available data. + +## The dbt models + +The `schema.yml` file in the project's models directory specifies the sources for our models. These sources align with the data we loaded from GCP using our SQL scripts. + +### Staging area + +The staging area models are merely ingesting the data from each of the sources and renaming each field, if appropiate. +In the schema.yml of this directory we define basic integrity checks for the primary keys. + +### Core area + +The following advanced dbt concepts are applied in the models at this stage: + +#### Incremental materializations + +The `schema.yml` file in this directory specifies that the materializations of the two models we are building are incremental. +We employ different strategies for these models: + +* For the `all_orders model`, we utilize the delete+insert strategy. This strategy is implemented because there may be changes in the status of an order that are included in the data updates. +* For the `all_order_products` model, we employ the default append strategy. This approach is chosen because the same combination of `order_id` and `product_id` may appear multiple times in the sources. +This indicates that a new quantity of the same product has been added or removed from a specific order. + +#### Macro assisted assertions + +Within the `all_order_products` model, we have included an assertion with the help of a macro to test and guarantee that the resulting model encompasses a unique combination of `order_id` and `product_id`. This combination denotes the latest quantity of products of a specific type per order. + +#### Teradata modifiers + +For both the `all_order` and `all_order_products` models, we have incorporated Teradata Modifiers to enhance tracking of these two core models. +To facilitate collecting statistics, we have added a `post_hook` that instructs the database connector accordingly. Additionally, we have created an index on the `order_id` column within the `all_orders` table. + + +## Running transformations + +### Create dimensional model with baseline data + +By executing dbt, we generate the dimensional model using the baseline data. + + +``` bash +dbt run +``` + +This will create both our core and dimensional models using the baseline data. + +### Test the data + +We can run our defined test by executing: + + +```bash +dbt test +``` + +### Running sample queries + +You can find sample business intelligence queries in the `references/query` path of the project. These queries allow you to analyze the factual data based on dimensions such as customers, orders, and products. 
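
As a flavor of what such a query can look like, the sketch below aggregates revenue by customer from the dimensional model; the `fact_order_details` and `dim_customers` names and columns are assumptions and may differ from the actual files in `references/query`.

```sql
-- Hypothetical example: total revenue per customer from the dimensional model.
SELECT
    c.first_name,
    c.last_name,
    SUM(f.product_price * f.product_quantity) AS total_revenue
FROM fact_order_details f
JOIN dim_customers c
    ON c.customer_id = f.customer_id
GROUP BY c.first_name, c.last_name
ORDER BY total_revenue DESC;
```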
+ +### Mocking the ELT process + +The scripts for loading updates into the source data set can be found in the `references/inserts/update_data.sql` path of the project. + +After updating the data sources, you can proceed with the aforementioned steps: running dbt, testing the data, and executing sample queries. This will allow you to visualize the variations and incremental updates in the data. + +## Summary + +In this tutorial, we explored the utilization of advanced dbt concepts with Teradata Vantage. +The sample project showcased the transformation of source data into a dimensional data mart. +Throughout the project, we implemented several advanced dbt concepts, including incremental materializations, utility macros, and Teradata modifiers. + + \ No newline at end of file diff --git a/quickstarts/manage-data/airflow.md b/quickstarts/manage-data/airflow.md new file mode 100644 index 0000000000..d47e0295c9 --- /dev/null +++ b/quickstarts/manage-data/airflow.md @@ -0,0 +1,249 @@ +--- +sidebar_position: 3 +id: airflow +author: Adam Tworkiewicz +email: adam.tworkiewicz@teradata.com +page_last_update: September 7th, 2021 +description: Teradata Vantage Native Object Storage - read and write from/to object storage, unified SQL interface for Vantage and object storage. +keywords: [data warehouses, compute storage separation, teradata, vantage, cloud data platform, object storage, business intelligence, enterprise analytics] +--- + +import ClearscapeDocsNote from '../_partials/vantage_clearscape_analytics.mdx' +import CommunityLink from '../_partials/community_link.mdx' + +# Use Apache Airflow with Teradata Vantage + +## Overview + +Native Object Storage (NOS) is a Vantage feature that allows you to query data stored in files in object storage such as AWS S3, Google GCS, Azure Blob or on-prem implementations. It's useful in scenarios where you want to explore data without building a data pipeline to bring it into Vantage. + +## Prerequisites + +You need access to a Teradata Vantage instance. NOS is enabled in all Vantage editions from Vantage Express through Developer, DYI to Vantage as a Service starting from version 17.10. + + + +## Explore data with NOS + +:::note +Currently, NOS supports CSV, JSON (as array or new-line delimited), and Parquet data formats. +::: + +Let's say you have a dataset stored as CSV files in an S3 bucket. You want to explore the dataset before you decide if you want to bring it into Vantage. For this scenario, we are going to use a public dataset published by Teradata that contains river flow data collected by the +U.S. Geological Survey. The bucket is at https://td-usgs-public.s3.amazonaws.com/. + +Let's first have a look at sample CSV data. 
We take the first 10 rows that Vantage will fetch from the bucket: + +```sql +SELECT + TOP 10 * +FROM ( + LOCATION='/s3/td-usgs-public.s3.amazonaws.com/CSVDATA/' +) AS d; +``` + +Here is what I've got: + +```sql +GageHeight2 Flow site_no datetime Precipitation GageHeight +----------- ----- -------- ---------------- ------------- ----------- +10.9 15300 09380000 2018-06-28 00:30 671 9.80 +10.8 14500 09380000 2018-06-28 01:00 673 9.64 +10.7 14100 09380000 2018-06-28 01:15 672 9.56 +11.0 16200 09380000 2018-06-27 00:00 669 9.97 +10.9 15700 09380000 2018-06-27 00:30 668 9.88 +10.8 15400 09380000 2018-06-27 00:45 672 9.82 +10.8 15100 09380000 2018-06-27 01:00 672 9.77 +10.8 14700 09380000 2018-06-27 01:15 672 9.68 +10.9 16000 09380000 2018-06-27 00:15 668 9.93 +10.8 14900 09380000 2018-06-28 00:45 672 9.72 +``` + +We have got plenty of numbers, but what do they mean? To answer this question, we will ask Vantage to detect the schema of the CSV files: + +```sql +SELECT + * +FROM ( + LOCATION='/s3/td-usgs-public.s3.amazonaws.com/CSVDATA/' + RETURNTYPE='NOSREAD_SCHEMA' +) AS d; +``` + +Vantage will now fetch a data sample to analyze the schema and return results: + +```sql +Name Datatype FileType Location +--------------- ----------------------------------- --------- ------------------------------------------------------------------- +GageHeight2 decimal(3,2) csv /S3/s3.amazonaws.com/td-usgs-public/CSVDATA/09513780/2018/06/27.csv +Flow decimal(3,2) csv /S3/s3.amazonaws.com/td-usgs-public/CSVDATA/09513780/2018/06/27.csv +site_no int csv /S3/s3.amazonaws.com/td-usgs-public/CSVDATA/09513780/2018/06/27.csv +datetime TIMESTAMP(0) FORMAT'Y4-MM-DDBHH:MI' csv /S3/s3.amazonaws.com/td-usgs-public/CSVDATA/09513780/2018/06/27.csv +Precipitation decimal(3,2) csv /S3/s3.amazonaws.com/td-usgs-public/CSVDATA/09513780/2018/06/27.csv +GageHeight decimal(3,2) csv /S3/s3.amazonaws.com/td-usgs-public/CSVDATA/09513780/2018/06/27.csv +``` + + + +We see that the CSV files have 6 columns. For each column, we get the name, the datatype and the file coordinates that were used to infer the schema. + +## Query data with NOS + +Now that we know the schema, we can work with the dataset as if it was a regular SQL table. To prove the point, let's try to do some data aggregation. Let's get an average temperature per site for sites that collect temperatures. + +```sql +SELECT + site_no Site_no, AVG(Flow) Avg_Flow +FROM ( + LOCATION='/s3/td-usgs-public.s3.amazonaws.com/CSVDATA/' +) AS d +GROUP BY + site_no +HAVING + Avg_Flow IS NOT NULL; +``` + +Result: + +```sql +Site_no Avg_Flow +-------- --------- +09380000 11 +09423560 73 +09424900 93 +09429070 81 +``` + +To register your ad hoc exploratory activity as a permanent source, create it as a foreign table: + +```sql +-- If you are running this sample as dbc user you will not have permissions +-- to create a table in dbc database. Instead, create a new database and use +-- the newly create database to create a foreign table. 
+ +CREATE DATABASE Riverflow + AS PERMANENT = 60e6, -- 60MB + SPOOL = 120e6; -- 120MB + +-- change current database to Riverflow +DATABASE Riverflow; + +CREATE FOREIGN TABLE riverflow + USING ( LOCATION('/s3/td-usgs-public.s3.amazonaws.com/CSVDATA/') ); + +SELECT top 10 * FROM riverflow; +``` + +Result: + +```sql +Location GageHeight2 Flow site_no datetime Precipitation GageHeight +------------------------------------------------------------------- ----------- ---- ------- ------------------- ------------- ---------- +/S3/s3.amazonaws.com/td-usgs-public/CSVDATA/09429070/2018/07/02.csv null null 9429070 2018-07-02 14:40:00 1.21 null +/S3/s3.amazonaws.com/td-usgs-public/CSVDATA/09400815/2018/07/10.csv null 0.00 9400815 2018-07-10 00:30:00 0.00 -0.01 +/S3/s3.amazonaws.com/td-usgs-public/CSVDATA/09400815/2018/07/10.csv null 0.00 9400815 2018-07-10 00:45:00 0.00 -0.01 +/S3/s3.amazonaws.com/td-usgs-public/CSVDATA/09400815/2018/07/10.csv null 0.00 9400815 2018-07-10 01:00:00 0.00 -0.01 +/S3/s3.amazonaws.com/td-usgs-public/CSVDATA/09400815/2018/07/10.csv null 0.00 9400815 2018-07-10 00:15:00 0.00 -0.01 +/S3/s3.amazonaws.com/td-usgs-public/CSVDATA/09429070/2018/07/02.csv null null 9429070 2018-07-02 14:38:00 1.06 null +``` + + +This time, the `SELECT` statement looks like a regular select against an in-database table. If you require subsecond response time when querying the data, there is an easy way to bring the CSV data into Vantage to speed things up. Read on to find out how. + +## Load data from NOS into Vantage + +Querying object storage takes time. What if you decided that the data looks interesting and you want to do some more analysis with a solution that will you quicker answers? The good news is that data returned with NOS can be used as a source for `CREATE TABLE` statements. Assuming you have `CREATE TABLE` privilege, you will be able to run: + +IMPORTANT: This query assumes you created database `Riverflow` and a foreign table called `riverflow` in the previous step. + +```sql +-- This query assumes you created database `Riverflow` +-- and a foreign table called `riverflow` in the previous step. + +CREATE MULTISET TABLE riverflow_native (site_no, Flow, GageHeight, datetime) +AS ( + SELECT site_no, Flow, GageHeight, datetime FROM riverflow +) WITH DATA +NO PRIMARY INDEX; + +SELECT TOP 10 * FROM riverflow_native; +``` + +Result: + +```sql +site_no Flow GageHeight datetime +------- ----- ---------- ------------------- +9400815 .00 -.01 2018-07-10 00:30:00 +9400815 .00 -.01 2018-07-10 01:00:00 +9400815 .00 -.01 2018-07-10 01:15:00 +9400815 .00 -.01 2018-07-10 01:30:00 +9400815 .00 -.01 2018-07-10 02:00:00 +9400815 .00 -.01 2018-07-10 02:15:00 +9400815 .00 -.01 2018-07-10 01:45:00 +9400815 .00 -.01 2018-07-10 00:45:00 +9400815 .00 -.01 2018-07-10 00:15:00 +9400815 .00 -.01 2018-07-10 00:00:00 +``` + +This time, the `SELECT` query returned in less than a second. Vantage didn't have to fetch the data from NOS. Instead, it answered using data that was already on its nodes. + +## Access private buckets + +So far, we have used a public bucket. What if you have a private bucket? How do you tell Vantage what credentials it should use? + +It is possible to inline your credentials directly into your query: + +```sql +SELECT + TOP 10 * +FROM ( + LOCATION='/s3/td-usgs-public.s3.amazonaws.com/CSVDATA/' + AUTHORIZATION='{"ACCESS_ID":"","ACCESS_KEY":""}' +) AS d; +``` + +Entering these credentials all the time can be tedious and less secure. 
In Vantage, you can create an authorization object that will serve as a container for your credentials: + +```sql +CREATE AUTHORIZATION aws_authorization + USER 'YOUR-ACCESS-KEY-ID' + PASSWORD 'YOUR-SECRET-ACCESS-KEY'; +``` + +You can then reference your authorization object when you create a foreign table: + +```sql +CREATE FOREIGN TABLE riverflow +, EXTERNAL SECURITY aws_authorization +USING ( LOCATION('/s3/td-usgs-public.s3.amazonaws.com/CSVDATA/') ); +``` + +## Export data from Vantage to object storage + +So far, we have talked about reading and importing data from object storage. Wouldn't it be nice if we had a way to use SQL to export data from Vantage to object storage? This is exactly what `WRITE_NOS` function is for. Let's say we want to export data from `riverflow_native` table to object storage. You can do so with the following query: + +```sql +SELECT * FROM WRITE_NOS ( + ON ( SELECT * FROM riverflow_native ) + PARTITION BY site_no ORDER BY site_no + USING + LOCATION('YOUR-OBJECT-STORE-URI') + AUTHORIZATION(aws_authorization) + STOREDAS('PARQUET') + COMPRESSION('SNAPPY') + NAMING('RANGE') + INCLUDE_ORDERING('TRUE') +) AS d; +``` + +Here, we instruct Vantage to take data from `riverflow_native` and save it in `YOUR-OBJECT-STORE-URI` bucket using `parquet` format. The data will be split into files by `site_no` attribute. The files will be compressed. + +## Summary + +In this quick start we have learned how to read data from object storage using Native Object Storage (NOS) functionality in Vantage. NOS supports reading and importing data stored in CSV, JSON and Parquet formats. NOS can also export data from Vantage to object storage. + +## Further reading +* [Teradata Vantage™ - Native Object Store Getting Started Guide](https://docs.teradata.com/r/2mw8ooFr~xX0EaaGFaDW8A/root) + + \ No newline at end of file diff --git a/quickstarts/manage-data/configure-a-teradata-vantage-connection-in-datahub.md b/quickstarts/manage-data/configure-a-teradata-vantage-connection-in-datahub.md new file mode 100644 index 0000000000..cbaf023948 --- /dev/null +++ b/quickstarts/manage-data/configure-a-teradata-vantage-connection-in-datahub.md @@ -0,0 +1,124 @@ +--- +sidebar_position: 18 +author: Paul Ibberson +email: paul.ibberson2@teradata.com +page_last_update: Deccember 19th, 2023 +description: Configure a Teradata Vantage connection in DataHub. +keywords: [data warehouses, compute storage separation, teradata, vantage, cloud data platform, object storage, business intelligence, enterprise analytics, datahub, data catalog, data lineage] +--- + +# Configure a Teradata Vantage connection in DataHub + +## Overview + +This how-to demonstrates how to create a connection to Teradata Vantage with DataHub, and ingest metadata about tables and views, along with usage and lineage information. + +## Prerequisites + +import ClearscapeDocsNote from '../_partials/vantage_clearscape_analytics.mdx' + +* Access to a Teradata Vantage instance. + +* DataHub installed. 
See [DataHub Quickstart Guide](https://datahubproject.io/docs/quickstart) + +## Setup DataHub + +* Install the Teradata plugin for DataHub in the environment where you have DataHub installed + +``` bash +pip install 'acryl-datahub[teradata]' +``` + +* Setup a Teradata user and set privileges to allow that user to read the dictionary tables + +``` sql +CREATE USER datahub FROM AS PASSWORD = PERM = 20000000; + +GRANT SELECT ON dbc.columns TO datahub; +GRANT SELECT ON dbc.databases TO datahub; +GRANT SELECT ON dbc.tables TO datahub; +GRANT SELECT ON DBC.All_RI_ChildrenV TO datahub; +GRANT SELECT ON DBC.ColumnsV TO datahub; +GRANT SELECT ON DBC.IndicesV TO datahub; +GRANT SELECT ON dbc.TableTextV TO datahub; +GRANT SELECT ON dbc.TablesV TO datahub; +GRANT SELECT ON dbc.dbqlogtbl TO datahub; -- if lineage or usage extraction is enabled +``` +* If you want to run profiling, you need to grant select permission on all the tables you want to profile. + +* If you want to extract lineage or usage metadata, query logging must be enabled and it is set to size which will fit for your queries (the default query text size Teradata captures is max 200 chars) An example how you can set it for all users: + +``` sql +-- set up query logging on all + +REPLACE QUERY LOGGING LIMIT SQLTEXT=2000 ON ALL; +``` + +## Add a Teradata connection to DataHub +With DataHub running, open the DataHub GUI and login. In this example this is running at localhost:9002 + +* Start the new connection wizard by clicking on the ingestion plug icon +![Ingestion Label](../other-integrations/images/configure-a-teradata-connection-in-datahub/ingestion-icon.png) + +and then selecting "Create new source" +![Create New Source](../other-integrations/images/configure-a-teradata-connection-in-datahub/create-new-source.png) + +* Scroll the list of available sources and select Other +![Select Source](../other-integrations/images/configure-a-teradata-connection-in-datahub/select-other-source.png) + +* A recipe is needed to configure the connection to Teradata and define the options required such as whether to capture table and column lineage, profile the data or retrieve usage statistics. Below is a simple recipe to get you started. The host, username and password should be changed to match your environment. + +``` yaml +pipeline_name: my-teradata-ingestion-pipeline +source: + type: teradata + config: + host_port: "myteradatainstance.teradata.com:1025" + username: myuser + password: mypassword + #database_pattern: + # allow: + # - "my_database" + # ignoreCase: true + include_table_lineage: true + include_usage_statistics: true + stateful_ingestion: + enabled: true +``` + +Pasting the recipe into the window should look like this: +![New Ingestion Source](../other-integrations/images/configure-a-teradata-connection-in-datahub/new-ingestion-source.png) + +* Click Next and then setup the required schedule. +![Set Schedule](../other-integrations/images/configure-a-teradata-connection-in-datahub/set-schedule.png) + +* Click Next to Finish Up and give the connection a name. Click Advanced so that the correct CLI version can be set. DataHub support for Teradata became available in CLI 0.12.x. Suggest selecting the most current version to ensure the best compatibility. +![Finish up](../other-integrations/images/configure-a-teradata-connection-in-datahub/finish-up.png) + +* Once the new source has been saved, it can be executed manually by clicking Run. 
+![Execute](../other-integrations/images/configure-a-teradata-connection-in-datahub/execute.png) + +Clicking on "Succeeded" after a sucessful execution will bring up a dialogue similar to this one where you can see the Databases, Tables and Views that have been ingested into DataHub. +![Ingestion Result](../other-integrations/images/configure-a-teradata-connection-in-datahub/ingestion-result.png) + +* The metadata can now be explored in the GUI by browsing: + * DataSets provides a list of the datasets (tables and views) loaded +![datasets](../other-integrations/images/configure-a-teradata-connection-in-datahub/datasets.png) + * Entities captured from the database +![Entities](../other-integrations/images/configure-a-teradata-connection-in-datahub/entities-list.png) + * Schema of an entity showing column/field names, data types and usage if it has been captured +![Schema display](../other-integrations/images/configure-a-teradata-connection-in-datahub/schema.png) + * Lineage providing a visual representation of how data is linked between tables and views +![Lineage picture](../other-integrations/images/configure-a-teradata-connection-in-datahub/lineage-weather.png) + +## Summary + +This how-to demonstrated how to create a connection to Teradata Vantage with DataHub in order to capture metadata of tables, views along with lineage and usage statistics. + +## Further reading +* [Integrate DataHub with Teradata Vantage](https://datahubproject.io/docs/generated/ingestion/sources/teradata) +* [DataHub Integration Options for Recipes](https://datahubproject.io/docs/metadata-ingestion/#recipes) + +import CommunityLinkPartial from '../_partials/community_link.mdx'; + + diff --git a/quickstarts/manage-data/connect-azure-data-share-to-teradata-vantage.md b/quickstarts/manage-data/connect-azure-data-share-to-teradata-vantage.md new file mode 100644 index 0000000000..a314b7f037 --- /dev/null +++ b/quickstarts/manage-data/connect-azure-data-share-to-teradata-vantage.md @@ -0,0 +1,610 @@ +--- +sidebar_position: 16 +author: Rupal Shah +email: rupal.shah@teradata.com +page_last_update: February 14th, 2022 +description: Connect Azure Data Share to Teradata Vantage. +keywords: [data warehouses, compute storage separation, teradata, vantage, cloud data platform, data cloud, data sharing] +--- + +# Connect Azure Data Share to Teradata Vantage + +### Overview + +This article describes the process to share an Azure Blob Storage dataset from one user to another using Azure Data Share service and then query it with Teradata Vantage leveraging Native Object Store (NOS) capability. We will create and use a storage account and data share account for both users. + +This is a diagram of the workflow. + +![](../cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image2.png) + + +### About Azure Data Share + +[Azure Data Share](https://docs.microsoft.com/en-us/azure/data-share/overview) enables organizations to simply and securely share data with multiple customers and partners. Both the data provider and data consumer must have an Azure subscription to share and receive data. Azure Data Share currently offers snapshot-based sharing and in-place sharing. Today, Azure Data Share [supported data stores](https://docs.microsoft.com/en-us/azure/data-share/supported-data-stores) include Azure Blob Storage, Azure Data Lake Storage Gen1 and Gen2, Azure SQL Database, Azure Synapse Analytics and Azure Data Explorer. 
Once a dataset share has been sent using Azure Data Share, the data consumer is able to receive that data in a data store of their choice like Azure Blob Storage and then use Teradata Vantage to explore and analyze the data. + +For more information see [documentation](https://docs.microsoft.com/en-us/azure/data-share). + +### About Teradata Vantage + +Vantage is the modern cloud platform that unifies data warehouses, data lakes, and analytics into a single connected ecosystem. + +Vantage combines descriptive, predictive, prescriptive analytics, autonomous decision-making, ML functions, and visualization tools into a unified, integrated platform that uncovers real-time business intelligence at scale, no matter where the data resides. + +Vantage enables companies to start small and elastically scale compute or storage, paying only for what they use, harnessing low-cost object stores and integrating their analytic workloads. + +Vantage supports R, Python, Teradata Studio, and any other SQL-based tools. You can deploy Vantage across public clouds, on-premises, on optimized or commodity infrastructure, or as-a-service. + +Teradata Vantage Native Object Store (NOS) can be used to explore data in external object stores, like Azure Blob Storage, using standard SQL. No special object storage-side compute infrastructure is required to use NOS. You can explore data located in an Blob Storage container by simply creating a NOS table definition that points to your container. With NOS, you can quickly import data from Blob Storage or even join it other tables in the database. + +Alternatively, the Teradata Parallel Transporter (TPT) utility can be used to import data from Blob Storage to Teradata Vantage in bulk fashion. Once loaded, data can be efficiently queried within Vantage. + +For more information see [*documentation*](https://docs.teradata.com/home). + +### Prerequisites + +import ClearscapeDocsNote from '../_partials/vantage_clearscape_analytics.mdx' + +* Access to a Teradata Vantage instance. + +* An Azure account. You can start with a [free account](https://azure.microsoft.com/free). +* An [Azure Blob Storage](https://docs.microsoft.com/en-us/azure/storage/common/storage-quickstart-create-account?tabs=azure-portal) account to store the dataset + +### Procedure + +Once you have met the prerequisites, follow these steps: + +1. Create a Azure Blob Storage account and container +2. Create a Data Share Account +3. Create a share +4. Accept and receive data using Data Share +5. Configure NOS access to Blob Storage +6. Query the dataset in Blob Storage +7. Load data from Blob Storage into Vantage (optional) + +### Create an Azure Blob Storage Account and Container + +* Open the [Azure portal](https://portal.azure.com/) in a browser (Chrome, Firefox, and Safari work well) and follow the steps in [create a storage account](https://docs.microsoft.com/en-us/azure/storage/common/storage-account-create?tabs=azure-portal#create-a-storage-account-1) in a resource group called _myProviderStorage_rg_ in this article. + +* Enter a storage name and connectivity method. We will use _myproviderstorage_ and _public endpoint_ in this article. + +:::note +We suggest that you use the same location for all services you create. +::: + +* Select **Review + create,** then **Create**. + +* *Go to resource* and click **Containers** to create container. + +* Click the **+ Container** button. + +* Enter a container name. We will use _providerdata_ in this article. 
+![](../cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image3.png) + +* Click **Create**. + +### Create a Data Share Account + +We will create a Data Share account for the provider sharing the dataset. + +Follow the [Create an Azure Data Share Account](https://docs.microsoft.com/en-us/azure/data-share/share-your-data?tabs=azure-portal#create-a-data-share-account) steps to create resource in a resource group called _myDataShareProvider_rg_ in this article. + +* In **Basics** tab, enter a data share account name. We will use _mydatashareprovider_ in this article. +![](../cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image4.png) +:::note +We suggest that you use the same location for all services you create. +::: + +* Select **Review + create,** then **Create**. + +* When the deployment is complete, select *Go to resource*. + +### Create a Share + +* Navigate to your Data Share Overview page and follow the steps in [Create a share](https://docs.microsoft.com/en-us/azure/data-share/share-your-data?tabs=azure-portal#create-a-share). + +* Select **Start sharing your data**. + +* Select + **Create**. + +* In **Details** tab, enter a share name and share type. We will use _WeatherData_ and _Snapshot_ in this article. +![](../cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image5.png) + +:::note Snapshot share +Choose snapshot sharing to provide copy of the data to the recipient. + +Supported data store: Azure Blob Storage, Azure Data Lake Storage Gen1, Azure Data Lake Storage Gen2, Azure SQL Database, Azure Synapse Analytics (formerly SQL DW) +::: + +:::note In-place share + +Choose in-place sharing to provide access to data at its source. + +Supported data store: Azure Data Explorer +::: + +* Click **Continue**. + +* In *Datasets* tab, click *Add datasets* + +* Select *Azure Blob Storage* +![](../cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image6.png) + +* Click *Next*. + +* Enter Storage account providing the dataset. We will use _myproviderstorage_ in this article. +![](../cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image7.png) + + +* Click ***Next**. + +* Double-click container to choose the dataset. We will use _providerdata_ and _onpoint_history_postal-code_hour.csv_ file in this article. +![](../cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image8.png) + +Figure 6 Select Storage container and dataset + +:::note +Azure Data Share can share at the folder and file level. Use Azure Blob Storage resource to upload a file. +::: + +* Click **Next**. + +* Enter a Dataset name that the consumer will see for the folder and dataset. We will use the default names but delete the providerdata folder this article. Click *Add datasets*. +![](../cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image9.png) + +* Click **Add datasets**. +![Dataset added to Sent Shares](../cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image10.png) + +* Click **Continue**. + +* In *Recipients* tab, click *Add recipient* email address to send share notification. + +* Enter email address for consumer. +![Add recipient email address](../cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image11.png) + +:::tip +Set Share expiration for amount of time share is valid for consumer to accept. +::: + +* Click **Continue**. + +* In *Settings* tab, set Snapshot schedule. We use default _unchecked_ this article. 

![Set Snapshot schedule](../cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image12.png)

* Click *Continue*.

* In the *Review + Create* tab, click *Create*.
![Review + Create](../cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image13.png)

* Your Azure Data Share has now been created, and the recipient of your Data Share is ready to accept your invitation.
![Data Share ready and invitation sent to recipient](../cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image14.png)

### Accept and Receive Data Using Azure Data Share

In this article, the recipient/consumer is going to receive the data into their Azure Blob Storage account.

As with the Data Share _Provider_, ensure that all prerequisites are complete for the _Consumer_ before accepting a data share invitation.

* Azure Subscription: If you don't have one, create a [free account](https://azure.microsoft.com/free/) before you begin.
* Azure Blob Storage account and container: create a resource group called _myConsumerStorage_rg_, a storage account named _myconsumerstorage_, and a container named _consumerdata_.
* Azure Data Share account: create a resource group called _myDataShareConsumer_rg_ and a data share account named _mydatashareconsumer_ to accept the data.

Follow the steps in [Accept and receive data using Azure Data Share](https://docs.microsoft.com/en-us/azure/data-share/subscribe-to-data-share?tabs=azure-portal).

#### Open invitation

* In your email, look for an invitation from Microsoft Azure with a subject titled "Azure Data Share invitation from [yourdataprovider@domain.com](mailto:yourdataprovider@domain.com)". Click *View invitation* to see your invitation in Azure.
![Data Share email invitation to recipient](../cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image15.png)

* This action opens your browser to the list of Data Share invitations.
![Data Share invitations](../cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image16.png)

* Select the share you would like to view. We will select _WeatherData_ in this article.

#### Accept invitation

* Under _Target Data Share Account_, select the Subscription and Resource Group that you would like to deploy your Data Share into, or create a new Data Share account here.

:::note
If the provider requires acceptance of Terms of Use, a dialog box appears and you'll be required to check the box to indicate that you agree to the terms of use.
:::

* Enter the Resource group and Data share account. We will use the _myDataShareConsumer_rg_ resource group and the _mydatashareconsumer_ account in this article.
![Target Data Share account](../cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image17.png)

* Select *Accept and configure* and a share subscription will be created.

#### Configure received share

* Select the **Datasets** tab. Check the box next to the dataset you'd like to assign a destination to. Select + *Map to target* to choose a target data store.
![Select Dataset and Map to target](../cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image18.png)

* Select the target data store type and path that you'd like the data to land in. We will use the consumer's Azure Blob Storage account _myconsumerstorage_ and container _consumerdata_ for our snapshot example in this article.

:::note
Azure Data Share provides open and flexible data sharing, including the ability to share from and to different data stores.
Check [supported](https://docs.microsoft.com/en-us/azure/data-share/supported-data-stores#supported-data-stores) data sources that can accept snapshot and in place sharing. +::: + +![Map datasets to target](../cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image19.png) + +* Click on **Map to target**. + +* Once mapping is complete, for snapshot-based sharing click on *Details* tab and click *Trigger snapshot* for _Full_ or _Incremental_. We will select full copy since this is your first time receiving data from your provider. +![Trigger full or incremental snapshot](../cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image20.png) + +* When the last run status is _successful_, go to target data store to view the received data. Select *Datasets*, and click on the link in the Target Path. +![Dataset and target path to view shared data](../cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image21.png) + + +### Configure NOS Access to Azure Blob Storage + +Native Object Store (NOS) can directly read data in Azure Blob Storage, which allows you to explore and analyze data in Blob Storage without explicitly loading the data. + +#### Create a foreign table definition + +A foreign table definition allows data in Blob Storage to be easily referenced within the Advanced SQL Engine and makes the data available in a structured, relational format. + +:::note +NOS supports data in CSV, JSON, and Parquet formats. +::: + +* Login to your Vantage system with Teradata Studio. + +* Create an AUTHORIZATION object to access your Blob Storage container with the following SQL command. + +``` sql +CREATE AUTHORIZATION DefAuth_AZ +AS DEFINER TRUSTED +USER 'myconsumerstorage' /* Storage Account Name */ +PASSWORD '*****************' /* Storage Account Access Key or SAS Token */ +``` + +** Replace the string for _USER_ with your Storage Account Name. +** Replace the string for _PASSWORD_ with your Storage Account Access Key or SAS Token. + +* Create a foreign table definition for the CSV file on Blob Storage with the following SQL command. + +``` sql +CREATE MULTISET FOREIGN TABLE WeatherData, +EXTERNAL SECURITY DEFINER TRUSTED DefAuth_AZ ( + Location VARCHAR(2048) CHARACTER SET UNICODE CASESPECIFIC, + Payload DATASET INLINE LENGTH 64000 STORAGE FORMAT CSV +) +USING ( + LOCATION ('/AZ/myconsumerstorage.blob.core.windows.net/consumerdata/') +) +``` + +:::note +At a minimum, the foreign table definition must include a table name (WeatherData) and a location clause, which points to the object store data. +::: + +The _LOCATION_ requires a storage account name and container name. You will need to replace this with your own storage account and container name. + +If the object doesn't have a standard extension (e.g. “.json”, “.csv”, “.parquet”), then the _Location…Payload_ columns definition phrase is also needed, and the LOCATION phase need to include the file name. For example: LOCATION (`AZ/.blob.core.windows.net//`). + +Foreign tables are always defined as No Primary Index (NoPI) tables. + +### Query the Dataset in Azure Blob Storage + +Run the following SQL command to query the dataset. + +``` sql +SELECT * FROM WeatherData SAMPLE 10; +``` + +The foreign table only contains two columns: Location and Payload. Location is the address in the object store system. The data itself is represented in the payload column, with the payload value within each record in the foreign table representing a single CSV row. 
+ +![WeatherData table](../cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image22.png) + +Run the following SQL command to focus on the data in the object. + +``` sql +SELECT payload..* FROM WeatherData SAMPLE 10; +``` + +![WeatherData table payload](../cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image23.png) + +#### Create a View + +Views can simplify the names associated with the payload attributes, can make it easier to code SQL against the object data, and can hide the Location references in the foreign table. + + +:::note +Vantage foreign tables use the `..` (double dot or double period) operator to separate the object name from the column name. +::: + +* Run the following SQL command to create a view. + +``` sql +REPLACE VIEW WeatherData_view AS ( + SELECT + CAST(payload..postal_code AS VARCHAR(10)) Postal_code, + CAST(payload..country AS CHAR(2)) Country, + CAST(payload..time_valid_utc AS TIMESTAMP(0) FORMAT 'YYYY-MM-DDBHH:MI:SS') Time_Valid_UTC, + CAST(payload..doy_utc AS INTEGER) DOY_UTC, + CAST(payload..hour_utc AS INTEGER) Hour_UTC, + CAST(payload..time_valid_lcl AS TIMESTAMP(0) FORMAT 'YYYY-MM-DDBHH:MI:SS') Time_Valid_LCL, + CAST(payload..dst_offset_minutes AS INTEGER) DST_Offset_Minutes, + CAST(payload..temperature_air_2m_f AS DECIMAL(4,1)) Temperature_Air_2M_F, + CAST(payload..temperature_wetbulb_2m_f AS DECIMAL(3,1)) Temperature_Wetbulb_2M_F, + CAST(payload..temperature_dewpoint_2m_f AS DECIMAL(3,1)) Temperature_Dewpoint_2M_F, + CAST(payload..temperature_feelslike_2m_f AS DECIMAL(4,1)) Temperature_Feelslike_2M_F, + CAST(payload..temperature_windchill_2m_f AS DECIMAL(4,1)) Temperature_Windchill_2M_F, + CAST(payload..temperature_heatindex_2m_f AS DECIMAL(4,1)) Temperature_Heatindex_2M_F, + CAST(payload..humidity_relative_2m_pct AS DECIMAL(3,1)) Humidity_Relative_2M_Pct, + CAST(payload..humidity_specific_2m_gpkg AS DECIMAL(3,1)) Humdity_Specific_2M_GPKG, + CAST(payload..pressure_2m_mb AS DECIMAL(5,1)) Pressure_2M_Mb, + CAST(payload..pressure_tendency_2m_mb AS DECIMAL(2,1)) Pressure_Tendency_2M_Mb, + CAST(payload..pressure_mean_sea_level_mb AS DECIMAL(5,1)) Pressure_Mean_Sea_Level_Mb, + CAST(payload..wind_speed_10m_mph AS DECIMAL(3,1)) Wind_Speed_10M_MPH, + CAST(payload..wind_direction_10m_deg AS DECIMAL(4,1)) Wind_Direction_10M_Deg, + CAST(payload..wind_speed_80m_mph AS DECIMAL(3,1)) Wind_Speed_80M_MPH, + CAST(payload..wind_direction_80m_deg AS DECIMAL(4,1)) Wind_Direction_80M_Deg, + CAST(payload..wind_speed_100m_mph AS DECIMAL(3,1)) Wind_Speed_100M_MPH, + CAST(payload..wind_direction_100m_deg AS DECIMAL(4,1)) Wind_Direction_100M_Deg, + CAST(payload..precipitation_in AS DECIMAL(3,2)) Precipitation_in, + CAST(payload..snowfall_in AS DECIMAL(3,2)) Snowfall_in, + CAST(payload..cloud_cover_pct AS INTEGER) Cloud_Cover_Pct, + CAST(payload..radiation_solar_total_wpm2 AS DECIMAL(5,1)) Radiation_Solar_Total_WPM2 + FROM WeatherData +) +``` + +* Run the following SQL command to validate the view. + +``` sql +SELECT * FROM WeatherData_view SAMPLE 10; +``` +![WeatherData_view](../cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image24.png) + +Now that you have created a view, you can easily reference the object store data in a query and combine it with other tables, both relational tables in Vantage as well as foreign tables in an object store. This allows you to leverage the full analytic capabilities of Vantage on 100% of the data, no matter where the data is located. 
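
For example, the view can be joined directly to an in-database table. The sketch below is illustrative only: `mydb.store_locations` is an assumed local table keyed by postal code and is not created anywhere in this guide; only `WeatherData_view` comes from the steps above.

``` sql
-- Hypothetical join between object-store data (via the view) and a local table.
SELECT
    s.store_name,
    w.Time_Valid_UTC,
    w.Temperature_Air_2M_F,
    w.Precipitation_in
FROM WeatherData_view w
JOIN mydb.store_locations s   -- assumed in-database table, not part of this guide
    ON s.postal_code = w.Postal_code
WHERE w.Precipitation_in > 0.5;
```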
+ +### Load Data from Blob Storage into Vantage (optional) + +Having a persistent copy of the Blob Storage data can be useful when repetitive access of the same data is expected. NOS does not automatically make a persistent copy of the Blob Storage data. Each time you reference a foreign table, Vantage will fetch the data from Blob Storage. (Some data may be cached, but this depends on the size of the data in Blob Storage and other active workloads in Vantage.) + +In addition, you may be charged network fees for data transferred from Blob Storage. If you will be referencing the data in Blob Storage multiple times, you may reduce your cost by loading it into Vantage, even temporarily. + +You can select among the approaches below to load the data into Vantage. + +#### Create the table and load the data in a single statement + +You can use a single statement to both create the table and load the data. You can choose the desired attributes from the foreign table payload and what they will be called in the relational table. + +A **CREATE TABLE AS … WITH DATA** statement can be used with the foreign table definition as the source table. + +* Run the following SQL command to create the relational table and load the data. + +``` sql +CREATE MULTISET TABLE WeatherData_temp AS ( + SELECT + CAST(payload..postal_code AS VARCHAR(10)) Postal_code, + CAST(payload..country AS CHAR(2)) Country, + CAST(payload..time_valid_utc AS TIMESTAMP(0) FORMAT 'YYYY-MM-DDBHH:MI:SS') Time_Valid_UTC, + CAST(payload..doy_utc AS INTEGER) DOY_UTC, + CAST(payload..hour_utc AS INTEGER) Hour_UTC, + CAST(payload..time_valid_lcl AS TIMESTAMP(0) FORMAT 'YYYY-MM-DDBHH:MI:SS') Time_Valid_LCL, + CAST(payload..dst_offset_minutes AS INTEGER) DST_Offset_Minutes, + CAST(payload..temperature_air_2m_f AS DECIMAL(4,1)) Temperature_Air_2M_F, + CAST(payload..temperature_wetbulb_2m_f AS DECIMAL(3,1)) Temperature_Wetbulb_2M_F, + CAST(payload..temperature_dewpoint_2m_f AS DECIMAL(3,1)) Temperature_Dewpoint_2M_F, + CAST(payload..temperature_feelslike_2m_f AS DECIMAL(4,1)) Temperature_Feelslike_2M_F, + CAST(payload..temperature_windchill_2m_f AS DECIMAL(4,1)) Temperature_Windchill_2M_F, + CAST(payload..temperature_heatindex_2m_f AS DECIMAL(4,1)) Temperature_Heatindex_2M_F, + CAST(payload..humidity_relative_2m_pct AS DECIMAL(3,1)) Humidity_Relative_2M_Pct, + CAST(payload..humidity_specific_2m_gpkg AS DECIMAL(3,1)) Humdity_Specific_2M_GPKG, + CAST(payload..pressure_2m_mb AS DECIMAL(5,1)) Pressure_2M_Mb, + CAST(payload..pressure_tendency_2m_mb AS DECIMAL(2,1)) Pressure_Tendency_2M_Mb, + CAST(payload..pressure_mean_sea_level_mb AS DECIMAL(5,1)) Pressure_Mean_Sea_Level_Mb, + CAST(payload..wind_speed_10m_mph AS DECIMAL(3,1)) Wind_Speed_10M_MPH, + CAST(payload..wind_direction_10m_deg AS DECIMAL(4,1)) Wind_Direction_10M_Deg, + CAST(payload..wind_speed_80m_mph AS DECIMAL(3,1)) Wind_Speed_80M_MPH, + CAST(payload..wind_direction_80m_deg AS DECIMAL(4,1)) Wind_Direction_80M_Deg, + CAST(payload..wind_speed_100m_mph AS DECIMAL(3,1)) Wind_Speed_100M_MPH, + CAST(payload..wind_direction_100m_deg AS DECIMAL(4,1)) Wind_Direction_100M_Deg, + CAST(payload..precipitation_in AS DECIMAL(3,2)) Precipitation_in, + CAST(payload..snowfall_in AS DECIMAL(3,2)) Snowfall_in, + CAST(payload..cloud_cover_pct AS INTEGER) Cloud_Cover_Pct, + CAST(payload..radiation_solar_total_wpm2 AS DECIMAL(5,1)) Radiation_Solar_Total_WPM2 + FROM + WeatherData + WHERE + Postal_Code = '36101' +) +WITH DATA +NO PRIMARY INDEX +``` + +* Run the following SQL command to validate the contents of the 
table. + +``` sql +SELECT * FROM WeatherData_temp SAMPLE 10; +``` +![](../cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image25.png)[Weather data,width=624,height=87] + +#### Create the table and load the data in multiple statements + +You can also use multiple statements to first create the relational table and then load the data. An advantage of this choice is that you can perform multiple loads, possibly selecting different data or loading in smaller increments if the object is very large. + +* Run the following SQL command to create the relational table. + +``` sql +CREATE MULTISET TABLE WeatherData_temp ( + Postal_code VARCHAR(10), + Country CHAR(2), + Time_Valid_UTC TIMESTAMP(0) FORMAT 'YYYY-MM-DDBHH:MI:SS', + DOY_UTC INTEGER, + Hour_UTC INTEGER, + Time_Valid_LCL TIMESTAMP(0) FORMAT 'YYYY-MM-DDBHH:MI:SS', + DST_Offset_Minutes INTEGER, + Temperature_Air_2M_F DECIMAL(4,1), + Temperature_Wetbulb_2M_F DECIMAL(3,1), + Temperature_Dewpoint_2M_F DECIMAL(3,1), + Temperature_Feelslike_2M_F DECIMAL(4,1), + Temperature_Windchill_2M_F DECIMAL(4,1), + Temperature_Heatindex_2M_F DECIMAL(4,1), + Humidity_Relative_2M_Pct DECIMAL(3,1), + Humdity_Specific_2M_GPKG DECIMAL(3,1), + Pressure_2M_Mb DECIMAL(5,1), + Pressure_Tendency_2M_Mb DECIMAL(2,1), + Pressure_Mean_Sea_Level_Mb DECIMAL(5,1), + Wind_Speed_10M_MPH DECIMAL(3,1), + Wind_Direction_10M_Deg DECIMAL(4,1), + Wind_Speed_80M_MPH DECIMAL(3,1), + Wind_Direction_80M_Deg DECIMAL(4,1), + Wind_Speed_100M_MPH DECIMAL(3,1), + Wind_Direction_100M_Deg DECIMAL(4,1), + Precipitation_in DECIMAL(3,2), + Snowfall_in DECIMAL(3,2), + Cloud_Cover_Pct INTEGER, + Radiation_Solar_Total_WPM2 DECIMAL(5,1) +) +UNIQUE PRIMARY INDEX ( Postal_Code, Time_Valid_UTC ) +``` + +* Run the following SQL to load the data into the table. 
+ +``` sql +INSERT INTO WeatherData_temp + SELECT + CAST(payload..postal_code AS VARCHAR(10)) Postal_code, + CAST(payload..country AS CHAR(2)) Country, + CAST(payload..time_valid_utc AS TIMESTAMP(0) FORMAT 'YYYY-MM-DDBHH:MI:SS') Time_Valid_UTC, + CAST(payload..doy_utc AS INTEGER) DOY_UTC, + CAST(payload..hour_utc AS INTEGER) Hour_UTC, + CAST(payload..time_valid_lcl AS TIMESTAMP(0) FORMAT 'YYYY-MM-DDBHH:MI:SS') Time_Valid_LCL, + CAST(payload..dst_offset_minutes AS INTEGER) DST_Offset_Minutes, + CAST(payload..temperature_air_2m_f AS DECIMAL (4,1)) Temperature_Air_2M_F, + CAST(payload..temperature_wetbulb_2m_f AS DECIMAL(3,1)) Temperature_Wetbulb_2M_F, + CAST(payload..temperature_dewpoint_2m_f AS DECIMAL(3,1)) Temperature_Dewpoint_2M_F, + CAST(payload..temperature_feelslike_2m_f AS DECIMAL(4,1)) Temperature_Feelslike_2M_F, + CAST(payload..temperature_windchill_2m_f AS DECIMAL(4,1)) Temperature_Windchill_2M_F, + CAST(payload..temperature_heatindex_2m_f AS DECIMAL(4,1)) Temperature_Heatindex_2M_F, + CAST(payload..humidity_relative_2m_pct AS DECIMAL(3,1)) Humidity_Relative_2M_Pct, + CAST(payload..humidity_specific_2m_gpkg AS DECIMAL(3,1)) Humdity_Specific_2M_GPKG, + CAST(payload..pressure_2m_mb AS DECIMAL(5,1)) Pressure_2M_Mb, + CAST(payload..pressure_tendency_2m_mb AS DECIMAL(2,1)) Pressure_Tendency_2M_Mb, + CAST(payload..pressure_mean_sea_level_mb AS DECIMAL(5,1)) Pressure_Mean_Sea_Level_Mb, + CAST(payload..wind_speed_10m_mph AS DECIMAL(3,1)) Wind_Speed_10M_MPH, + CAST(payload..wind_direction_10m_deg AS DECIMAL(4,1)) Wind_Direction_10M_Deg, + CAST(payload..wind_speed_80m_mph AS DECIMAL(3,1)) Wind_Speed_80M_MPH, + CAST(payload..wind_direction_80m_deg AS DECIMAL(4,1)) Wind_Direction_80M_Deg, + CAST(payload..wind_speed_100m_mph AS DECIMAL(3,1)) Wind_Speed_100M_MPH, + CAST(payload..wind_direction_100m_deg AS DECIMAL(4,1)) Wind_Direction_100M_Deg, + CAST(payload..precipitation_in AS DECIMAL(3,2)) Precipitation_in, + CAST(payload..snowfall_in AS DECIMAL(3,2)) Snowfall_in, + CAST(payload..cloud_cover_pct AS INTEGER) Cloud_Cover_Pct, + CAST(payload..radiation_solar_total_wpm2 AS DECIMAL(5,1)) Radiation_Solar_Total_WPM2 + FROM + WeatherData + WHERE + Postal_Code = '30301' +``` + +* Run the following SQL command to validate the contents of the table. + +``` sql +SELECT * FROM WeatherData_temp SAMPLE 10; +``` +![](../cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image26.png)[WeatherData_temp,width=624,height=84] + +#### READ_NOS - An alternative method to foreign tables + +An alternative to defining a foreign table is to use the `READ_NOS` table operator. This table operator allows you to access data directly from an object store without first creating a foreign table, or to view a list of the keys associated with all the objects specified by a Location clause. + +You can use the `READ_NOS` table operator to explore the data in an object. + +* Run the following command to explore the data in an object. + +``` sql , id="azure_data_share_first_run", role="content-editable emits-gtm-events" +SELECT + TOP 5 payload..* +FROM + READ_NOS ( + ON (SELECT CAST( NULL AS DATASET STORAGE FORMAT CSV)) + USING + LOCATION ('/AZ/myconsumerstorage.blob.core.windows.net/consumerdata') + ACCESS_ID('myconsumerstorage') + ACCESS_KEY('*****') + ) AS THE_TABLE + ORDER BY 1 +``` + + * The _LOCATION_ requires a storage account name and container name. This is highlighted above in yellow. You will need to replace this with your own storage account and container name. 
+ * Replace the string for _ACCESS_ID_ with your Storage Account Name. + * Replace the string for _ACCES_KEY_ with your Storage Account Access Key or SAS Token + +![](../cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image27.png)[READ_NOS,width=624,height=86] + +You can also leverage the READ_NOS table operator to get the length (size) of the object. + +* Run the following SQL command to view the size of the object. + +``` sql +SELECT + location(CHAR(120)), ObjectLength +FROM + READ_NOS ( + ON (SELECT CAST( NULL AS DATASET STORAGE FORMAT CSV)) + USING + LOCATION ('/AZ/myconsumerstorage.blob.core.windows.net/consumerdata') + ACCESS_ID('myconsumerstorage') + ACCESS_KEY('*****') + RETURNTYPE('NOSREAD_KEYS') + ) AS THE_TABLE +ORDER BY 1 +``` + * Replace the values for _LOCATION_, _ACCESS_ID_, and _ACCESS_KEY_. + +![READ_NOS object length](../cloud-guides/images/connect-azure-data-share-to-teradata-vantage/image28.png) + +You can substitute the NOS_READ table operator for a foreign table definition in the above section for loading the data into a relational table. + +``` sql +CREATE MULTISET TABLE WeatherData_temp AS ( + SELECT + CAST(payload..postal_code AS VARCHAR(10)) Postal_code, + CAST(payload..country AS CHAR(2)) Country, + CAST(payload..time_valid_utc AS TIMESTAMP(0) FORMAT 'YYYY-MM-DDBHH:MI:SS') Time_Valid_UTC, + CAST(payload..doy_utc AS INTEGER) DOY_UTC, + CAST(payload..hour_utc AS INTEGER) Hour_UTC, + CAST(payload..time_valid_lcl AS TIMESTAMP(0) FORMAT 'YYYY-MM-DDBHH:MI:SS') Time_Valid_LCL, + CAST(payload..dst_offset_minutes AS INTEGER) DST_Offset_Minutes, + CAST(payload..temperature_air_2m_f AS DECIMAL (4,1)) Temperature_Air_2M_F, + CAST(payload..temperature_wetbulb_2m_f AS DECIMAL(3,1)) Temperature_Wetbulb_2M_F, + CAST(payload..temperature_dewpoint_2m_f AS DECIMAL(3,1)) Temperature_Dewpoint_2M_F, + CAST(payload..temperature_feelslike_2m_f AS DECIMAL(4,1)) Temperature_Feelslike_2M_F, + CAST(payload..temperature_windchill_2m_f AS DECIMAL(4,1)) Temperature_Windchill_2M_F, + CAST(payload..temperature_heatindex_2m_f AS DECIMAL(4,1)) Temperature_Heatindex_2M_F, + CAST(payload..humidity_relative_2m_pct AS DECIMAL(3,1)) Humidity_Relative_2M_Pct, + CAST(payload..humidity_specific_2m_gpkg AS DECIMAL(3,1)) Humdity_Specific_2M_GPKG, + CAST(payload..pressure_2m_mb AS DECIMAL(5,1)) Pressure_2M_Mb, + CAST(payload..pressure_tendency_2m_mb AS DECIMAL(2,1)) Pressure_Tendency_2M_Mb, + CAST(payload..pressure_mean_sea_level_mb AS DECIMAL(5,1)) Pressure_Mean_Sea_Level_Mb, + CAST(payload..wind_speed_10m_mph AS DECIMAL(3,1)) Wind_Speed_10M_MPH, + CAST(payload..wind_direction_10m_deg AS DECIMAL(4,1)) Wind_Direction_10M_Deg, + CAST(payload..wind_speed_80m_mph AS DECIMAL(3,1)) Wind_Speed_80M_MPH, + CAST(payload..wind_direction_80m_deg AS DECIMAL(4,1)) Wind_Direction_80M_Deg, + CAST(payload..wind_speed_100m_mph AS DECIMAL(3,1)) Wind_Speed_100M_MPH, + CAST(payload..wind_direction_100m_deg AS DECIMAL(4,1)) Wind_Direction_100M_Deg, + CAST(payload..precipitation_in AS DECIMAL(3,2)) Precipitation_in, + CAST(payload..snowfall_in AS DECIMAL(3,2)) Snowfall_in, + CAST(payload..cloud_cover_pct AS INTEGER) Cloud_Cover_Pct, + CAST(payload..radiation_solar_total_wpm2 AS DECIMAL(5,1)) Radiation_Solar_Total_WPM2 + FROM + READ_NOS ( + ON (SELECT CAST( NULL AS DATASET STORAGE FORMAT CSV)) + USING + LOCATION ('/AZ/myconsumerstorage.blob.core.windows.net/consumerdata') + ACCESS_ID('myconsumerstorage') + ACCESS_KEY('*****') + ) AS THE_TABLE + WHERE + Postal_Code = '36101' +) +WITH DATA +``` + +import 
CommunityLinkPartial from '../_partials/community_link.mdx'; + + diff --git a/quickstarts/manage-data/create-parquet-files-in-object-storage.md b/quickstarts/manage-data/create-parquet-files-in-object-storage.md new file mode 100644 index 0000000000..13b54a0db7 --- /dev/null +++ b/quickstarts/manage-data/create-parquet-files-in-object-storage.md @@ -0,0 +1,148 @@ +--- +sidebar_position: 12 +id: create-parquet-files-in-object-storage +author: Obed Vega +email: obed.vega@teradata.com +page_last_update: August 2nd, 2022 +description: Teradata Vantage Native Object Storage - read and write from/to object storage, unified SQL interface for Vantage and object storage. +keywords: [data warehouses, compute storage separation, Teradata, vantage, cloud data platform, object storage, business intelligence, enterprise analytics, parquet, create parquet files] +--- + +import UseCase from '../_partials/use-csae.mdx'; +import CommunityLink from '../_partials/community_link.mdx'; + +# Create Parquet files in object storage + +## Overview +Native Object Storage (NOS) is a Vantage feature that allows you to query data stored in files such as CSV, JSON, and Parquet format datasets. +These datasets are located on external S3-compatible object storage such as AWS S3, Google GCS, Azure Blob or on-prem implementations. +It's useful in scenarios where you want to explore data without building a data pipeline to bring it into Vantage. This tutorial demonstrates how to export data from Vantage to object storage using the Parquet file format. + +## Prerequisites + +You need access to a Teradata Vantage instance. NOS is enabled in all Vantage editions from Vantage Express through Developer, DYI to Vantage as a Service starting from version 17.10. + +IMPORTANT: This tutorial is based on s3 aws object storage. You will need your own s3 bucket with write permissions to complete the tutorial. + + + +## Create a Parquet file with WRITE_NOS function + +`WRITE_NOS` allows you to extract selected or all columns from a database table or from derived results and write to external object storage, such as Amazon S3, Azure Blob storage, Azure Data Lake Storage Gen2, and Google Cloud Storage. This functionality stores data in Parquet format. + +You can find more documentation about `WRITE_NOS` functionality in the [NOS documentation](https://docs.teradata.com/r/Teradata-VantageTM-Native-Object-Store-Getting-Started-Guide/June-2022/Writing-Data-to-External-Object-Store). + +You will need access to a database where you can execute `WRITE_NOS` function. If you don't have such a database, run the following commands: + +``` sql +CREATE USER db AS PERM=10e7, PASSWORD=db; + +-- Don't forget to give the proper access rights +GRANT EXECUTE FUNCTION on TD_SYSFNLIB.READ_NOS to db; +GRANT EXECUTE FUNCTION on TD_SYSFNLIB.WRITE_NOS to db; +``` + +:::note +If you would like to learn more about setting up users and their privileges, checkout the [NOS documentation](https://docs.teradata.com/r/Teradata-VantageTM-Native-Object-Store-Getting-Started-Guide/June-2022/Setting-Up-Access/Setting-Access-Privileges). +::: + + +1. Let's first create a table on your Teradata Vantage instance: + +```sql +CREATE SET TABLE db.parquet_table ,FALLBACK , + NO BEFORE JOURNAL, + NO AFTER JOURNAL, + CHECKSUM = DEFAULT, + DEFAULT MERGEBLOCKRATIO, + MAP = TD_MAP1 + ( + column1 SMALLINT NOT NULL, + column2 DATE FORMAT 'YY/MM/DD' NOT NULL, + column3 DECIMAL(10,2)) +PRIMARY INDEX ( column1 ); +``` + +2. 
Populate your table with example data: +```sql +INSERT INTO db.parquet_table (1,'2022/01/01',1.1); +INSERT INTO db.parquet_table (2,'2022/01/02',2.2); +INSERT INTO db.parquet_table (3,'2022/01/03',3.3); +``` + +Your table should now look like this: + +```sql +column1 column2 column3 +------- -------- ------------ + 1 22/01/01 1.10 + 2 22/01/02 2.20 + 3 22/01/03 3.30 +``` + +3. Create the parquet file with `WRITE_NOS`. Don't forget to replace `` with the name of your s3 bucket. Also,replace `` and `` with your access key and secret. + +:::note +Check your cloud provider docs how to create credentials to access object storage. For example, for AWS check out [How do I create an AWS access key?](https://aws.amazon.com/premiumsupport/knowledge-center/create-access-key/) +::: + +```sql +SELECT * FROM WRITE_NOS ( +ON ( SELECT * FROM db.parquet_table) +USING +LOCATION('/s3/.s3.amazonaws.com/parquet_file_on_NOS.parquet') +AUTHORIZATION('{"ACCESS_ID":"", +"ACCESS_KEY":""}') +STOREDAS('PARQUET') +MAXOBJECTSIZE('16MB') +COMPRESSION('SNAPPY') +INCLUDE_ORDERING('TRUE') +INCLUDE_HASHBY('TRUE') +) as d; +``` + +Now you have created a parquet file in your object storage bucket. Now to easily query your file you need to follow step number 4. + +4. Create a NOS-backed foreign table. Don't forget to replace `` with the name of your s3 bucket. Also,replace `` and `` with your access key and secret: +```sql +CREATE MULTISET FOREIGN TABLE db.parquet_table_to_read_file_on_NOS +, EXTERNAL SECURITY DEFINER TRUSTED CEPH_AUTH, +MAP = TD_MAP1 +( + Location VARCHAR(2048) CHARACTER SET UNICODE CASESPECIFIC + , col1 SMALLINT + , col2 DATE + , col3 DECIMAL(10,2) + +) +USING ( + LOCATION ('/s3/.s3.amazonaws.com/parquet_file_on_NOS.parquet') + AUTHORIZATION('{"ACCESS_ID":"", + "ACCESS_KEY":""}') + STOREDAS ('PARQUET') +)NO PRIMARY INDEX; +``` + +5. Now you are ready to Query your parquet file on NOS, let's try the following query: +```sql +SELECT col1, col2, col3 FROM db.parquet_table_to_read_file_on_NOS; +``` + +The data returned from the query should look something like this: + +```sql + col1 col2 col3 +------ -------- ------------ + 1 22/01/01 1.10 + 2 22/01/02 2.20 + 3 22/01/03 3.30 +``` + +## Summary + +In this tutorial we have learned how to export data from Vantage to a parquet file on object storage using Native Object Storage (NOS). NOS supports reading and importing data stored in CSV, JSON and Parquet formats. NOS can also export data from Vantage to object storage. + +## Further reading +* [Teradata Vantage™ - Writing Data to External Object Store](https://docs.teradata.com/r/Teradata-VantageTM-Native-Object-Store-Getting-Started-Guide/June-2022/Writing-Data-to-External-Object-Store) + + \ No newline at end of file diff --git a/quickstarts/manage-data/dbt.md b/quickstarts/manage-data/dbt.md new file mode 100644 index 0000000000..f66b300d5d --- /dev/null +++ b/quickstarts/manage-data/dbt.md @@ -0,0 +1,148 @@ +--- +id: dbt +sidebar_position: 4 +author: Adam Tworkiewicz +email: adam.tworkiewicz@teradata.com +page_last_update: July 12th, 2023 +description: Use dbt (data build tool) with Teradata Vantage. +keywords: [data warehouses, compute storage separation, teradata, vantage, cloud data platform, object storage, business intelligence, enterprise analytics, elt, dbt.] 
+--- + +import ClearscapeDocsNote from '../_partials/vantage_clearscape_analytics.mdx' +import CommunityLink from '../_partials/community_link.mdx' +import tabsDBT from '../_partials/tabsDBT.mdx' + +# dbt with Teradata Vantage + +## Overview + +This tutorial demonstrates how to use dbt (Data Build Tool) with Teradata Vantage. It's based on the original [dbt Jaffle Shop tutorial](https://github.com/dbt-labs/jaffle_shop-dev). A couple of models have been adjusted to the SQL dialect supported by Vantage. + +## Prerequisites + +* Access to a Teradata Vantage instance. + + +* Python *3.7*, *3.8*, *3.9*, *3.10* or *3.11* installed. + +## Install dbt + +1. Clone the tutorial repository and cd into the project directory: +``` bash +git clone https://github.com/Teradata/jaffle_shop-dev.git jaffle_shop +cd jaffle_shop +``` + +2. Create a new python environment to manage dbt and its dependencies. Activate the environment: + + + +3. Install `dbt-teradata` module and its dependencies. The core dbt module is included as a dependency so you don't have to install it separately: +```bash +pip install dbt-teradata +``` + +## Configure dbt + +We will now configure dbt to connect to your Vantage database. Create file `$HOME/.dbt/profiles.yml` with the following content. Adjust ``, ``, `` to match your Teradata instance. + +:::note +* Database setup + +The following dbt profile points to a database called `jaffle_shop`. +If the database doesn't exist on your Teradata Vantage instance, it will be created. You can also change `schema` value to point to an existing database in your instance. +::: + +```bash +jaffle_shop: + outputs: + dev: + type: teradata + host: + user: + password: + logmech: TD2 + schema: jaffle_shop + tmode: ANSI + threads: 1 + timeout_seconds: 300 + priority: interactive + retries: 1 + target: dev +``` + +Now, that we have the profile file in place, we can validate the setup: + +```bash +dbt debug +``` + +If the debug command returned errors, you likely have an issue with the content of `profiles.yml`. + +## About the Jaffle Shop warehouse + +`jaffle_shop` is a fictional e-commerce store. This dbt project transforms raw data from an app database into a dimensional model with customer and order data ready for analytics. + +The raw data from the app consists of customers, orders, and payments, with the following entity-relationship diagram: + +![](../images/dbt1.svg) + +dbt takes these raw data table and builds the following dimensional model, which is more suitable for analytics tools: + +![](../images/dbt2.svg) + +## Run dbt + +### Create raw data tables + +In real life, we will be getting raw data from platforms like Segment, Stitch, Fivetran or another ETL tool. In our case, we will use dbt's `seed` functionality to create tables from csv files. The csv files are located in `./data` directory. Each csv file will produce one table. dbt will inspect the files and do type inference to decide what data types to use for columns. + +```bash +dbt seed +``` + +You should now see 3 tables in your `jaffle_shop` database: `raw_customers`, `raw_orders`, `raw_payments`. The tables should be populated with data from the csv files. + +### Create the dimensional model + +Now that we have the raw tables, we can instruct dbt to create the dimensional model: +```bash +dbt run +``` + +So what exactly happened here? dbt created additional tables using `CREATE TABLE/VIEW FROM SELECT` SQL. 
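For example, materializing a model as a table produces a statement of roughly this shape (a simplified, hypothetical sketch: the table and column names below are illustrative and not the exact SQL dbt generates for this project's models):

```sql
-- Illustrative only: an approximation of the CREATE TABLE AS SELECT that dbt
-- emits when it materializes a model as a table on Teradata. The table and
-- column names are assumptions, not the exact objects this project creates.
CREATE TABLE jaffle_shop.customer_orders AS (
    SELECT
        o.user_id AS customer_id,
        MIN(o.order_date) AS first_order,
        MAX(o.order_date) AS most_recent_order,
        COUNT(o.id) AS number_of_orders
    FROM jaffle_shop.raw_orders AS o
    GROUP BY o.user_id
) WITH DATA;
```
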
In the first transformation, dbt took raw tables and built denormalized join tables called `customer_orders`, `order_payments`, `customer_payments`. You will find the definitions of these tables in `./marts/core/intermediate`. +In the second step, dbt created `dim_customers` and `fct_orders` tables. These are the dimensional model tables that we want to expose to our BI tool. + +### Test the data + +dbt applied multiple transformations to our data. How can we ensure that the data in the dimensional model is correct? dbt allows us to define and execute tests against the data. The tests are defined in `./marts/core/schema.yml`. The file describes each column in all relationships. Each column can have multiple tests configured under `tests` key. For example, we expect that `fct_orders.order_id` column will contain unique, non-null values. To validate that the data in the produced tables satisfies the test conditions run: + +```bash +dbt test +``` + +### Generate documentation + +Our model consists of just a few tables. Imagine a scenario where where we have many more sources of data and a much more complex dimensional model. We could also have an intermediate zone between the raw data and the dimensional model that follows the Data Vault 2.0 principles. Would it not be useful, if we had the inputs, transformations and outputs documented somehow? dbt allows us to generate documentation from its configuration files: + +```bash +dbt docs generate +``` + +This will produce html files in `./target` directory. + +You can start your own server to browse the documentation. The following command will start a server and open up a browser tab with the docs' landing page: + +```bash +dbt docs serve +``` + +## Summary + +This tutorial demonstrated how to use dbt with Teradata Vantage. The sample project takes raw data and produces a dimensional data mart. We used multiple dbt commands to populate tables from csv files (`dbt seed`), create models (`dbt run`), test the data (`dbt test`), and generate and serve model documentation (`dbt docs generate`, `dbt docs serve`). + +## Further reading +* [dbt documentation](https://docs.getdbt.com/docs/) +* [dbt-teradata plugin documentation](https://github.com/Teradata/dbt-teradata) + + diff --git a/quickstarts/manage-data/execute-airflow-workflows-that-use-dbt-with-teradata-vantage.md b/quickstarts/manage-data/execute-airflow-workflows-that-use-dbt-with-teradata-vantage.md new file mode 100644 index 0000000000..da41b2b6cb --- /dev/null +++ b/quickstarts/manage-data/execute-airflow-workflows-that-use-dbt-with-teradata-vantage.md @@ -0,0 +1,290 @@ +--- +sidebar_position: 13 +author: Igor Machin, Ambrose Inman +email: igor.machin@teradata.com +page_last_update: November 18th, 2022 +description: Execute Airflow workflows that use dbt with Teradata Vantage +keywords: [data warehouses, compute storage separation, teradata, vantage, cloud data platform, object storage, business intelligence, enterprise analytics, airflow, queries, dbt] +--- + +# Execute Airflow workflows that use dbt with Teradata Vantage + +## Overview + +This tutorial demonstrates how to install Airflow on an AWS EC2 VM, configure the workflow to use dbt, and run it against a Teradata Vantage database. Airflow is a task scheduling tool that is typically used to build data pipelines to process and load data. In this example, we go through the Airflow installation process, which creates a Docker-based Airflow environment. 
Once Airflow is installed, we run several Airflow DAG (Directed Acyclic Graph, or simply workflow) examples that load data into a Teradata Vantage database.

## Prerequisites

1. Access to AWS (Amazon Web Services) with permissions to create a VM.
:::tip
This tutorial can be adjusted to other compute platforms or even a bare-metal machine, as long as the machine has compute and storage capacity comparable to the machine mentioned in this document (a t2.2xlarge EC2 instance on AWS with approximately 100GB of storage) and is connected to the internet. If you decide to use a different compute platform, some steps in the tutorial will have to be altered.
:::
2. An SSH client.
:::tip
If you are on a Mac or a Linux machine, these tools are already included. If you are on Windows, consider [PuTTY](https://www.putty.org) or [MobaXterm](https://mobaxterm.mobatek.net/download.html).
:::
3. Access to a Teradata Vantage database. If you don't have access to Teradata Vantage, explore [Vantage Express](https://quickstarts.teradata.com/#getting-access-to-vantage) - a free edition for developers.

## Install and execute Airflow

### Create a VM
1. Go to the AWS EC2 console and click on `Launch instance`.
2. Select `Red Hat` for the OS image.
3. Select `t2.2xlarge` for the instance type.
4. Create a new key pair or use an existing one.
5. Apply network settings that allow you to SSH to the server and give the server outbound connectivity to the internet. Usually, applying the default settings will do.
6. Assign 100GB of storage.

### Install Python

1. SSH to the machine as the `ec2-user` user.

2. Check if Python is installed (it should be Python 3.7 or higher). Type `python` or `python3` on the command line.

3. If Python is not installed (you get a `command not found` message), run the commands below to install it. The commands may require you to confirm the installation by typing `y` and pressing Enter.

``` bash , id="install_python", role="content-editable emits-gtm-events"
sudo yum install python3
# install pip and virtualenv so a virtual environment can be created for the project
sudo yum install python3-pip
sudo pip3 install virtualenv
```

### Create an Airflow environment

1. Create the Airflow directory structure (from the `ec2-user` home directory `/home/ec2-user`):

``` bash , id="install_airflow", role="content-editable emits-gtm-events"
mkdir airflow
cd airflow
mkdir -p ./dags ./logs ./plugins ./data ./config
echo -e "AIRFLOW_UID=$(id -u)" > .env
```

2. Use your preferred file transfer tool (`scp`, `PuTTY`, `MobaXterm`, or similar) to upload the [airflow.cfg](./attachments/execute-airflow-workflows-that-use-dbt-with-teradata-vantage/airflow.cfg) file to the `airflow/config` directory.

### Install Docker

Docker is a containerization tool that allows us to install Airflow in a containerized environment.

:::note
The steps must be executed in the `airflow` directory.
:::

1. Uninstall podman (the RHEL containerization tool):

``` bash , id="uninstall_podman", role="content-editable emits-gtm-events"
sudo yum remove docker \
docker-client \
docker-client-latest \
docker-common \
docker-latest \
docker-latest-logrotate \
docker-logrotate \
docker-engine \
podman \
runc
```

2. Install the yum utilities:

``` bash , id="install_yum", role="content-editable emits-gtm-events"
sudo yum install -y yum-utils
```

3. Add Docker to the yum repository.
+ +``` bash , id="add_docker_to_yum", role="content-editable emits-gtm-events" +sudo yum-config-manager \ +--add-repo \ +https://download.docker.com/linux/centos/docker-ce.repo +``` + +4. Install docker. + +``` bash , id="install_docker", role="content-editable emits-gtm-events" +sudo yum install docker-ce docker-ce-cli containerd.io +``` + +5. Start docker as a service. The first command runs the docker service automatically when the system starts up next time. The second command starts Docker now. + +``` bash , id="start_docker", role="content-editable emits-gtm-events" +sudo systemctl enable docker +sudo systemctl start docker +``` + +6. Check if Docker is installed correctly. This command should return an empty list of containers (since we have not started any container yet): + +``` bash , id="check_docker", role="content-editable emits-gtm-events" +sudo docker ps +``` + +### Install `docker-compose` and docker environment configuration files + +1. Upload [Dockerfile](./attachments/execute-airflow-workflows-that-use-dbt-with-teradata-vantage/docker-compose.yaml) and [Dockerfile](./attachments/execute-airflow-workflows-that-use-dbt-with-teradata-vantage/Dockerfile) files to the VM and save them in `airflow` directory. + +:::tip +What `docker-compose.yaml` and `Dockerfile` do +`docker-compose.yaml` and `Dockerfile` files are necessary to build the environment during the installation. The `docker-compose.yaml` file downloads and installs the Airflow docker container. The container includes the web ui, a Postgres database for metadata, the scheduler, 3 workers (so 3 tasks can be run in parallel), the trigger and the nginx web server to show the docs produced by `dbt`. In addition host directories are mounted on containers and various other install processes are performed. `Dockerfile` will additionally install needed packages in each container. + +If you would like to learn more what `docker-compose.yaml` and `Dockerfile` files do, examine these files. There are comments which clarify what is installed and why. +::: + +2. Install docker-compose (necessary to run the yaml file). + +:::note +The instructions are based on version 1.29.2. Check out https://github.com/docker/compose/releases site for the latest release and update the command below as needed. +::: + +``` bash , id="install_docker_compose", role="content-editable emits-gtm-events" +sudo curl -L https://github.com/docker/compose/releases/download/1.29.2/docker-compose-$(uname -s)-$(uname -m) -o /usr/local/bin/docker-compose +sudo chmod +x /usr/local/bin/docker-compose +sudo ln -s /usr/local/bin/docker-compose /usr/bin/docker-compose +``` + +3. Test your docker-compose installation. The command should return the docker-compose version, for example `docker-compose version 1.29.2, build 5becea4c`: + +``` bash , id="check_docker_compose", role="content-editable emits-gtm-events" +docker-compose --version +``` + +### Install a test dbt project + +:::note +These steps set up a sample dbt project. `dbt` tool itself will be installed on the containers later by `docker-compose`. +::: + +1. Install git: + +``` bash , id="install_git", role="content-editable emits-gtm-events" +sudo yum install git +``` + +2. Get the sample jaffle shop dbt project: + +:::note +The dbt directories will be created under the home directory (not under `airflow`). The home directory in our example is `/home/ec2-user`. 
:::

``` bash , id="download_sample_dbt_project", role="content-editable emits-gtm-events"
# move to home dir
cd
mkdir dbt
cd dbt
git clone https://github.com/Teradata/jaffle_shop-dev.git jaffle_shop
cd jaffle_shop
mkdir target
chmod 777 target
echo '' > target/index.html
chmod o+w target/index.html
```

3. Create the `airflowtest` and `jaffle_shop` users/databases on your Teradata database by using your preferred database tool (Teradata Studio Express, `bteq` or similar). Log into the database as `dbc`, then execute the commands (change the passwords if needed):

``` sql , id="create_databases", role="content-editable emits-gtm-events"
CREATE USER "airflowtest" FROM "dbc" AS PERM=5000000000 PASSWORD="abcd";
CREATE USER "jaffle_shop" FROM "dbc" AS PERM=5000000000 PASSWORD="abcd";
```

4. Create the dbt configuration directory:

``` bash , id="create_dbt_config_dir", role="content-editable emits-gtm-events"
cd
mkdir .dbt
```

5. Copy [profiles.yml](./attachments/execute-airflow-workflows-that-use-dbt-with-teradata-vantage/profiles.yml) into the `.dbt` directory.

6. Edit the file so it corresponds to your Teradata database setup. At a minimum, you will need to change the host, user, and password. Use the `jaffle_shop` user credentials you set up in step 3.

### Create the Airflow environment in Docker

1. Run the Docker environment creation script in the `airflow` directory where `Dockerfile` and `docker-compose.yaml` are located:

``` bash , id="run_docker_compose", role="content-editable emits-gtm-events"
cd ~/airflow
sudo docker-compose up --build
```

This can take 5-10 minutes. When the installation is complete, you should see a message on the screen similar to this:

``` bash , id="run_docker_compose_response", role="content-editable emits-gtm-events"
airflow-webserver_1 | 127.0.0.1 - - [13/Sep/2022:00:20:48 +0000] "GET /health HTTP/1.1" 200 187 "-" "curl/7.74.0"
```

This means the Airflow webserver is ready to accept calls.

2. Airflow should now be up. The terminal session we used during the installation will keep displaying log messages, so it is recommended to open another terminal session for subsequent steps. 
To check the Airflow installation type: + +``` bash , id="check_airflow_in_docker", role="content-editable emits-gtm-events" +sudo docker ps +``` + +The result should be something like: + +``` bash , id="check_airflow_in_docker_output", role="content-editable emits-gtm-events" +CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES +60d50d9f43f5 apache/airflow:2.2.4 "/usr/bin/dumb-init …" 18 minutes ago Up 18 minutes (healthy) 8080/tcp airflow_airflow-scheduler_1 +e2b46ec98274 apache/airflow:2.2.4 "/usr/bin/dumb-init …" 18 minutes ago Up 18 minutes (healthy) 8080/tcp airflow_airflow-worker_3_1 +7b44004c7277 apache/airflow:2.2.4 "/usr/bin/dumb-init …" 18 minutes ago Up 18 minutes (healthy) 8080/tcp airflow_airflow-worker_1_1 +4017b8ce9235 apache/airflow:2.2.4 "/usr/bin/dumb-init …" 18 minutes ago Up 18 minutes (healthy) 0.0.0.0:8080->8080/tcp, :::8080->8080/tcp airflow_airflow-webserver_1 +3cc407e2d565 apache/airflow:2.2.4 "/usr/bin/dumb-init …" 18 minutes ago Up 18 minutes (healthy) 0.0.0.0:5555->5555/tcp, :::5555->5555/tcp, 8080/tcp airflow_flower_1 +340a83b202e3 apache/airflow:2.2.4 "/usr/bin/dumb-init …" 18 minutes ago Up 18 minutes (healthy) 8080/tcp airflow_airflow-triggerer_1 +82198f0d8b84 apache/airflow:2.2.4 "/usr/bin/dumb-init …" 18 minutes ago Up 18 minutes (healthy) 8080/tcp airflow_airflow-worker_2_1 +382c3077c1e5 redis:latest "docker-entrypoint.s…" 18 minutes ago Up 18 minutes (healthy) 6379/tcp airflow_redis_1 +8a3be8d8a7f4 nginx "/docker-entrypoint.…" 18 minutes ago Up 18 minutes (healthy) 0.0.0.0:4000->80/tcp, :::4000->80/tcp airflow_nginx_1 +9ca888e9e8df postgres:13 "docker-entrypoint.s…" 18 minutes ago Up 18 minutes (healthy) 5432/tcp airflow_postgres_1 +``` + +3. OPTIONAL: If you want to delete the docker installation (for example to update the docker-compose.yaml and the Dockerfile files and recreate a different environment), the command is (from the airflow directory where these files are located): + +``` bash , id="docker_compose_down", role="content-editable emits-gtm-events" +sudo docker-compose down --volumes --rmi all +``` + +Once the stack is down, update the configuration files and restart by running the command in step 1. + + +4. To test if the Airflow web UI works, type the following urls on your browser. Replace `` with the external IP address of the VM: + * DAG UI: `http://:8080/home` - username: airflow / password: airflow + * Flower Airflow UI (worker control): `http://:5555/` + +### Run an Airflow DAG + +1. Copy [airflow_dbt_integration.py](./attachments/execute-airflow-workflows-that-use-dbt-with-teradata-vantage/airflow_dbt_integration.py), [db_test_example_dag.py](./attachments/execute-airflow-workflows-that-use-dbt-with-teradata-vantage/db_test_example_dag.py), [discover_dag.txt](./attachments/execute-airflow-workflows-that-use-dbt-with-teradata-vantage/discover_dag.txt), [variables.json](./attachments/execute-airflow-workflows-that-use-dbt-with-teradata-vantage/variables.json) files to `/home/ec2-user/airflow/dags`. +2. Examine the files: +* `airflow_dbt_integration.py` - a simple Teradata sql example that creates a few tables and runs queries. +* `db_test_example_dag.py` - runs a dbt example [i.e. integration of dbt and airflow with a Teradata database). 
In this example a fictitious jaffle_shop data model is created, loaded and the documentation for this project is produced (you can view it by pointing your browser to `http://:4000/`) + +:::note +[Adjust `db_test_example_dag.py`] +`db_test_example_dag.py` needs to be updated so that the Teradata database IP address points to your database. +::: + +* `discover_dag.py` - an example on how to load various types of data files (CSV, Parquet, JSON). The source code file contains comments that explain what the program does and how to use it. This example relies on `variables.json` file. The file needs to be imported into Airflow. It will happen in subsequent steps. + +3. Wait for a few minutes until these dag files are picked up by the airflow tool. Once they are picked up they will appear on the list of dags on the Airflow home page. + +4. Import `variables.json` file as a variable file into Airflow: +* Click on `Admin -> Variables` menu item to go to the Variables page +![Airflow admin dropdown](../other-integrations/images/execute-airflow-workflows-that-use-dbt-with-teradata-vantage/admin-dropdown.png) +* Click on `Choose File`, then select `variable.json` in your file explorer and click on `Import Variables` +![Airflow admin dropdown](../other-integrations/images/execute-airflow-workflows-that-use-dbt-with-teradata-vantage/import-variables.png) +* Edit the variables to match your environment + + +5. Run the dags from the UI and check the logs. + + + +## Summary + +This tutorial aimed at providing a hands on exercise on how to install an Airflow environment on a Linux server and how to use Airflow to interact with a Teradata Vantage database. An additional example is provided on how to integrate Airflow and the data modelling and maintenance tool dbt to create and load a Teradata Vantage database. + +## Further reading +* [Use dbt (data build tool) with Teradata Vantage](https://quickstarts.teradata.com/dbt.html#_install_dbt) + +import CommunityLinkPartial from '../_partials/community_link.mdx'; + + \ No newline at end of file diff --git a/quickstarts/manage-data/getting-started-dbt-feast-teradata-pipeline.md b/quickstarts/manage-data/getting-started-dbt-feast-teradata-pipeline.md new file mode 100644 index 0000000000..2b80d6d168 --- /dev/null +++ b/quickstarts/manage-data/getting-started-dbt-feast-teradata-pipeline.md @@ -0,0 +1,215 @@ +--- +sidebar_position: 7 +author: Ravi Chillanki +email: ravi.chillanki@teradata.com +page_last_update: August 4th, 2023 +description: dbt Feast integration with Teradata +keywords: [data warehouses, compute storage separation, teradata, vantage, cloud data platform, object storage, business intelligence, enterprise analytics, AI/ML, AI, ML, feature engineering, feature store, FEAST] +--- +import ClearscapeDocsNote from '../_partials/vantage_clearscape_analytics.mdx' + +# Use dbt and FEAST to build a feature store in Teradata Vantage + +## Overview + +This tutorial shows an approach to creating a dbt pipeline that takes raw data and turns it into FEAST features. The pipeline leverages '[ClearScape Analytics functions](https://docs.teradata.com/r/Enterprise_IntelliFlex_VMware/Teradata-VantageTM-Analytics-Database-Analytic-Functions-17.20)' for data transformations. The output of the transformations is loaded into FEAST to materialize features that can be used in ML models. + +## Introduction +### dbt +[dbt](https://www.getdbt.com/product/what-is-dbt) (Data Build Tool) is a data transformation tool that is the cornerstone of the Modern Data Stack. 
It takes care of the T in ELT (Extract Load Transform). The assumption is that some other process brings raw data into your data warehouse or lake. This data then needs to be transformed. + +### Feast +[Feast](https://docs.feast.dev) (Feature Store) is a flexible data system that utilizes existing technology to manage and provide machine learning features to real-time models. It allows for customization to meet specific needs. It also allows us to make features consistently available for training and serving, avoid data leakage and decouple ML from data infrastructure. + + +## Prerequisites + +* Access to a Teradata Vantage database instance. + + + +* Feast-Teradata installed locally. See [Feast-Teradata installation instructions](https://quickstarts.teradata.com/modelops/using-feast-feature-store-with-teradata-vantage.html#_overview) + +* dbt installed locally. See [dbt installation instructions](https://quickstarts.teradata.com/dbt.html) + +## Objective +The goal is to create a data pipeline with Teradata Vantage as a source, and perform data transformation on some variables in dbt. The principle transformation of data we do in dbt is the one-hot encoding of several columns like gender, marital status, state code, etc. On top of that, the account type column data will be transformed by performing aggregation operations on a couple of columns. All of this together generates the desired dataset with transformed data. The transformed dataset is used as an input into FEAST to store features. Features are then used to generate a training dataset for models. + + +## Getting started +1. Create a new python environment to manage dbt, feast, and their dependencies. Activate the environment: + +``` bash +python3 -m venv env +source env/bin/activate +``` + +2. Clone the tutorial repository and change the directory to the project directory: + +``` bash +git clone https://github.com/Teradata/tdata-pipeline.git +``` +The directory structure of the project cloned looks like this: + +``` +tdata-pipeline/ + feature_repo/ + feature_views.py + feature_store.yml + dbt_transformation/ + ... + macros + models + ... + generate_training_data.py + CreateDB.sql + dbt_project.yml +``` + + +## About the Banking warehouse +teddy_bank is a fictitious dataset of banking customers, consisting of mainly 3 tables customers, accounts, and +transactions, with the following entity-relationship diagram: + + +![dbt feast](../images/dbt3.svg) + +dbt takes this raw data and builds the following model, which is more suitable for ML modeling and analytics tools: + +![dbt feast](../other-integrations/images/getting-started-dbt-feast-teradata-pipeline/dbt-feast.png) + +## Configure dbt +Create file `$HOME/.dbt/profiles.yml` with the following content. Adjust ``, ``, `` to match your Teradata instance. + +:::note [Database setup] +The following dbt profile points to a database called `teddy_bank`. You can change `schema` value to point to an existing database in your Teradata Vantage instance: +::: + +``` yaml , id="dbt_first_config", role="emits-gtm-events" +dbt_transformation: + target: dev + outputs: + dev: + type: teradata + host: + user: + password: + schema: teddy_bank + tmode: ANSI +``` +Validate the setup: + +``` bash +dbt debug +``` + +If the debug command returned errors, you likely have an issue with the content of `profiles.yml`. + +## Configure FEAST +Feast configuration addresses connection to your Vantage database. 
The yaml file created while initializing the feast +project, `$HOME/.feast/feature_repo/feature_store.yml` can hold the details of offline storage, online storage, provider +and registry. Adjust ``, ``, `` to match your Teradata instance. + +:::note [Database setup] +The following dbt profile points to a database called `teddy_bank`. You can change `schema` value to point to an +existing database in your Teradata Vantage instance +::: + +### Offline Store Config + +``` yaml , id="feast_first_config", role="emits-gtm-events" +project: td_pipeline +registry: + registry_type: sql + path: teradatasql://:@/?database=teddy_bank&LOGMECH=TDNEGO +provider: local +offline_store: + type: feast_teradata.offline.teradata.TeradataOfflineStore + host: + database: teddy_bank + user: + password: + log_mech: TDNEGO +entity_key_serialization_version: 2 +``` +### Syntax for Teradata SQL Registry +``` python +path = 'teradatasql://'+ teradata_user +':' + teradata_password + '@'+host + '/?database=' + + teradata_database + '&LOGMECH=' + teradata_log_mech +``` +## Run dbt +In this step, we will populate the following data tables: `customers`, `accounts`, and `transactions`. + +``` bash +dbt seed +``` + +### Create the dimensional model +Now that we have the raw data tables, we can instruct dbt to create the dimensional model: + +``` bash +dbt run --select Analytic_Dataset +``` + +## Run FEAST +### Feature Repository definition +* `TeradataSource:` Data Source for features stored in Teradata (Enterprise or Lake) or accessible via a Foreign Table from Teradata (NOS, QueryGrid) + +* `Entity:` A collection of semantically related features + +* `Feature View:` A feature view is a group of feature data from a specific data source. Feature views allow you to consistently define features and their data sources, enabling the reuse of feature groups across a project + + +``` python +DBT_source = TeradataSource( database=dbload, table=f"Analytic_Dataset", timestamp_field="event_timestamp") + +customer = Entity(name = "customer", join_keys = ['cust_id']) + +ads_fv = FeatureView(name="ads_fv",entities=[customer],source=DBT_source, schema=[ + Field(name="age", dtype=Float32), + Field(name="income", dtype=Float32), + Field(name="q1_trans_cnt", dtype=Int64), + Field(name="q2_trans_cnt", dtype=Int64), + Field(name="q3_trans_cnt", dtype=Int64), + Field(name="q4_trans_cnt", dtype=Int64), + ],) +``` +### Generate training data +The approach to generating training data can vary. Depending upon the requirements, 'entitydf' may be joined with the source data tables using the feature views mapping. Here is a sample function that generates a training dataset. +``` python +def get_Training_Data(): + # Initialize a FeatureStore with our current repository's configurations + store = FeatureStore(repo_path="feature_repo") + con = create_context(host = os.environ["latest_vm"], username = os.environ["dbc_pwd"], + password = os.environ["dbc_pwd"], database = "EFS") + entitydf = DataFrame('Analytic_Dataset').to_pandas() + entitydf.reset_index(inplace=True) + print(entitydf) + entitydf = entitydf[['cust_id','event_timestamp']] + training_data = store.get_historical_features( + entity_df=entitydf, + features=[ + "ads_fv:age" + ,"ads_fv:income" + ,"ads_fv:q1_trans_cnt" + ,"ads_fv:q2_trans_cnt" + ,"ads_fv:q3_trans_cnt" + ,"ads_fv:q4_trans_cnt" + ], + full_feature_names=True + ).to_df() + + return training_data + + +``` + +## Summary +This tutorial demonstrated how to use dbt and FEAST with Teradata Vantage. 
The sample project takes raw data from Teradata Vantage and produces features with dbt. Metadata of features that form the base to generate a training dataset for a model was then created with FEAST; all its corresponding tables that create the feature store, are also generated at runtime within the same database. + +## Further Reading +* [dbt documentation](https://docs.getdbt.com/docs) +* [dbt-teradata plugin documentation](https://github.com/Teradata/dbt-teradata) +* [Feast Scalable Registry](https://docs.feast.dev/tutorials/using-scalable-registry) +* [Enabling highly scalable feature store with Teradata Vantage and FEAST](https://medium.com/teradata/enabling-highly-scalable-feature-store-with-teradata-vantage-and-feast-e01008fa8fdb) +* [Git repository](https://github.com/Teradata/tdata-pipeline) for this project. diff --git a/quickstarts/manage-data/ingest-catalog-data-teradata-s3-with-glue.md b/quickstarts/manage-data/ingest-catalog-data-teradata-s3-with-glue.md new file mode 100644 index 0000000000..3b20d02846 --- /dev/null +++ b/quickstarts/manage-data/ingest-catalog-data-teradata-s3-with-glue.md @@ -0,0 +1,276 @@ +--- +sidebar_position: 19 +author: Daniel Herrera +email: daniel.herrera2@teradata.com +page_last_update: March 18, 2024 +description: Ingest and catalog data from Teradata Vantage to Amazon S3 +keywords: [data warehouses, object storage, teradata, vantage, cloud data platform, data engineering, enterprise analytics, aws glue, aws lake formation, aws glue catalog] +--- + +# Ingest and Catalog Data from Teradata Vantage to Amazon S3 with AWS Glue Scripts + +### Overview +This quickstart details the process of ingesting and cataloging data from Teradata Vantage to Amazon S3 with AWS Glue. + +:::tip +For ingesting data to Amazon S3 when cataloging is not a requirement consider [Teradata Write NOS capabilities](https://docs.teradata.com/r/Enterprise_IntelliFlex_VMware/SQL-Data-Manipulation-Language/Working-with-External-Data/WRITE_NOS). 
+::: + + +### Prerequisites + +import ClearscapeDocsNote from '../_partials/vantage_clearscape_analytics.mdx' + +* Access to an [Amazon AWS account](https://aws.amazon.com) +* Access to a Teradata Vantage instance + +* A database [client](https://quickstarts.teradata.com/other-integrations/configure-a-teradata-vantage-connection-in-dbeaver.html) to send queries for loading the test data + +### Loading of test data +* In your favorite database client run the following queries ++ +``` sql +CREATE DATABASE teddy_retailers_inventory +AS PERMANENT = 110e6; + +CREATE TABLE teddy_retailers_inventory.source_catalog AS +( + SELECT product_id, product_name, product_category, price_cents + FROM ( + LOCATION='/s3/dev-rel-demos.s3.amazonaws.com/demo-datamesh/source_products.csv') as products +) WITH DATA; + +CREATE TABLE teddy_retailers_inventory.source_stock AS +( + SELECT entry_id, product_id, product_quantity, purchase_price_cents, entry_date + FROM ( + LOCATION='/s3/dev-rel-demos.s3.amazonaws.com/demo-datamesh/source_stock.csv') as stock +) WITH DATA; +``` + +### Amazon AWS setup +In this section, we will cover in detail each of the steps below: + +* Create an Amazon S3 bucket to ingest data +* Create an AWS Glue Catalog Database for storing metadata +* Store Teradata Vantage credentials in AWS Secrets Manager +* Create an AWS Glue Service Role to assign to ETL jobs +* Create a connection to a Teradata Vantage Instance in AWS Glue +* Create an AWS Glue Job +* Draft a script for automated ingestion and cataloging of Teradata Vantage data into Amazon S3 + +### Create an Amazon S3 Bucket to Ingest Data +* In Amazon S3, select `Create bucket`. +![create bucket](../cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Buckets-1.PNG) +* Assign a name to your bucket and take note of it. +![name bucket](../cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Buckets-2.PNG) +* Leave all settings at their default values. +* Click on `Create bucket`. +![save bucket](../cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Buckets-3.PNG) + +### Create an AWS Glue Catalog Database for Storing Metadata + +* In AWS Glue, select Data catalog, Databases. +* Click on `Add database`. +![add database](../cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Cat-1.PNG) +* Define a database name and click on `Create database`. +![add database name](../cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Cat-2.PNG) + +### Store Teradata Vantage credentials in AWS Secrets Manager + +* In AWS Secrets Manager, select `Create new secret`. +![create secret](../cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/secret-1.PNG) +* The secret should be an `Other type of secret` with the following keys and values according to your Teradata Vantage Instance: + * USER + * PASSWORD +:::tip +In the case of ClearScape Analytics Experience, the user is always "demo_user," and the password is the one you defined when creating your ClearScape Analytics Experience environment. +::: +![secret values](../cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/secret-2.PNG) +* Assign a name to the secret. +* The rest of the steps can be left with the default values. +* Create the secret. + +### Create an AWS Glue Service Role to Assign to ETL Jobs +The role you create should have access to the typical permissions of a Glue Service Role, but also access to read the secret and S3 bucket you've created. + +* In AWS, go to the IAM service. +* Under Access Management, select `Roles`. 
* In roles, click on `Create role`.
![create role](../cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Role-1.PNG)
* In Select trusted entity, select `AWS service` and pick `Glue` from the dropdown.
![role type](../cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Role-2.PNG)
* In Add permissions:
    * Search for `AWSGlueServiceRole`.
    * Click the related checkbox.
    * Search for `SecretsManagerReadWrite`.
    * Click the related checkbox.
* In Name, review, and create:
    * Define a name for your role.
![name role](../cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Role-3.PNG)
* Click on `Create role`.
* Return to Access Management, Roles, and search for the role you've just created.
* Select your role.
* Click on `Add permissions`, then `Create inline policy`.
* Click on `JSON`.
* In the Policy editor, paste the JSON object below, substituting `<YOUR-BUCKET-NAME>` with the name of the bucket you've created.
``` json
{
    "Version": "2012-10-17",
    "Statement": [
        {
            "Sid": "FullAccessToSpecificBucket",
            "Effect": "Allow",
            "Action": "s3:*",
            "Resource": [
                "arn:aws:s3:::<YOUR-BUCKET-NAME>",
                "arn:aws:s3:::<YOUR-BUCKET-NAME>/*"
            ]
        }
    ]
}
```
* Click `Next`.
![inline policy](../cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Role-4.PNG)
* Assign a name to your policy.
* Click on `Create policy`.

### Create a connection to a Teradata Vantage Instance in AWS Glue

* In AWS Glue, select `Data connections`.
![connection](../cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Glue-1.PNG)
* Under Connectors, select `Create connection`.
* Search for and select the Teradata Vantage data source.
![teradata type](../cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Glue-2.PNG)
* In the dialog box, enter the URL of your Teradata Vantage instance in JDBC format.
:::tip
In the case of ClearScape Analytics Experience, the URL has the following structure:
`jdbc:teradata://<YOUR-ENVIRONMENT-HOST>/DATABASE=demo_user,DBS_PORT=1025`
:::
* Select the AWS Secret created in the previous step.
* Name your connection and finish the creation process.
![connection configuration](../cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Glue-3.PNG)

### Create an AWS Glue Job
* In AWS Glue, select `ETL Jobs` and click on `Script editor`.
![script editor creation](../cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Glue-script-1.PNG)
* Select `Spark` as the engine and choose to start fresh.
![script editor type](../cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Glue-script-2.PNG)

### Draft a script for automated ingestion and cataloging of Teradata Vantage data into Amazon S3

* Copy the following script into the editor.
    * The script requires the following modifications:
        * Substitute the name of your S3 bucket.
        * Substitute the name of your Glue catalog database.
        * If you are not following the example in the guide, modify the database name and the tables to be ingested and cataloged.
        * For cataloging purposes, only the first row of each table is ingested in the example. This query can be modified to ingest the whole table or to filter selected rows. 
+ +``` python , id="glue-script-first-run" role="emits-gtm-events" +# Import section +import sys +from awsglue.transforms import * +from awsglue.utils import getResolvedOptions +from pyspark.context import SparkContext +from awsglue.context import GlueContext +from awsglue.job import Job +from pyspark.sql import SQLContext + +# PySpark Config Section +args = getResolvedOptions(sys.argv, ["JOB_NAME"]) +sc = SparkContext() +glueContext = GlueContext(sc) +spark = glueContext.spark_session +job = Job(glueContext) +job.init(args["JOB_NAME"], args) + +#ETL Job Parameters Section +# Source database +database_name = "teddy_retailers_inventory" + +# Source tables +table_names = ["source_catalog","source_stock"] + +# Target S3 Bucket +target_s3_bucket = "s3://" + +#Target catalog database +catalog_database_name = "" + + +# Job function abstraction +def process_table(table_name, transformation_ctx_prefix, catalog_database, catalog_table_name): + dynamic_frame = glueContext.create_dynamic_frame.from_options( + connection_type="teradata", + connection_options={ + "dbtable": table_name, + "connectionName": "Teradata connection default", + "query": f"SELECT TOP 1 * FROM {table_name}", # This line can be modified to ingest the full table or rows that fulfill an specific condition + }, + transformation_ctx=transformation_ctx_prefix + "_read", + ) + + s3_sink = glueContext.getSink( + path=target_s3_bucket, + connection_type="s3", + updateBehavior="UPDATE_IN_DATABASE", + partitionKeys=[], + compression="snappy", + enableUpdateCatalog=True, + transformation_ctx=transformation_ctx_prefix + "_s3", + ) + # Dynamically set catalog table name based on function parameter + s3_sink.setCatalogInfo( + catalogDatabase=catalog_database, catalogTableName=catalog_table_name + ) + s3_sink.setFormat("csv") + s3_sink.writeFrame(dynamic_frame) + + +# Job execution section +for table_name in table_names: + full_table_name = f"{database_name}.{table_name}" + transformation_ctx_prefix = f"{database_name}_{table_name}" + catalog_table_name = f"{table_name}_catalog" + # Call your process_table function for each table + process_table(full_table_name, transformation_ctx_prefix, catalog_database_name, catalog_table_name) + +job.commit() +``` + +* Assign a name to your script +![script in editor](../cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Glue-script-3.PNG) + +* In Job details, Basic properties: + * Select the IAM role you created for the ETL job. + * For testing, select "2" as the Requested number of workers, this is the minimum allowed. +![script configurations](../cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Glue-script-4.PNG) + * In `Advanced properties`, `Connections` select your connection to Teradata Vantage. +:::tip +The connection created must be referenced twice, once in the job configuration, once in the script itself. +::: +![script configuration connection](../cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Glue-script-5.PNG) +* Click on `Save`. +* Click on `Run`. + * The ETL job takes a couple of minutes to complete, most of this time is related to starting the Spark cluster. + +### Checking the Results + +* After the job is finished: + * Go to Data Catalog, Databases. + * Click on the catalog database you created. + * In this location, you will see the tables extracted and cataloged through your Glue ETL job. 
+![result tables](../cloud-guides/images/ingest-catalog-data-teradata-s3-with-glue/Results.PNG) + +* All tables ingested are also present as compressed files in S3. Rarely, these files would be queried directly. Services such as AWS Athena can be used to query the files relying on the catalog metadata. + +### Summary + +In this quick start, we learned how to ingest and catalog data in Teradata Vantage to Amazon S3 with AWS Glue Scripts. + +### Further reading +* [Integrate Teradata Vantage with Google Cloud Data Catalog](https://quickstarts.teradata.com/cloud-guides/integrate-teradata-vantage-with-google-cloud-data-catalog.html) + +import CommunityLinkPartial from '../_partials/community_link.mdx'; + + \ No newline at end of file diff --git a/quickstarts/manage-data/integrate-teradata-vantage-to-salesforce-using-amazon-appflow.md b/quickstarts/manage-data/integrate-teradata-vantage-to-salesforce-using-amazon-appflow.md new file mode 100644 index 0000000000..5047907f75 --- /dev/null +++ b/quickstarts/manage-data/integrate-teradata-vantage-to-salesforce-using-amazon-appflow.md @@ -0,0 +1,694 @@ +--- +sidebar_position: 14 +author: Wenjie Tehan +email: wenjie.tehan@teradata.com +page_last_update: February 14th, 2022 +description: Connect Teradata Vantage to Salesforce using Amazon Appflow. +keywords: [data warehouses, compute storage separation, teradata, vantage, cloud data platform, object storage, business intelligence, enterprise analytics, salesforce integration.] +--- + +# Connect Teradata Vantage to Salesforce using Amazon Appflow + +### Overview + +This how-to describes the process to migrate data between Salesforce and Teradata Vantage. It contains two use cases: + +1. Retrieve customer information from Salesforce, and combine it with order and shipping information from Vantage to derive analytical insights. +2. Update `newleads` table on Vantage with the Salesforce data, then add the new lead(s) back to Salesforce using AppFlow. + +![Diagram Description automatically generated](../cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image2.png) + +Amazon AppFlow transfers the customer account data from Salesforce to Amazon S3. Vantage then uses Native Object Store (NOS) read functionality to join the data in Amazon S3 with data in Vantage with a single query. + +The account information is used to update the `newleads` table on Vantage. Once the table is updated, Vantage writes it back to the Amazon S3 bucket with NOS Write. A Lambda function is triggered upon arrival of the new lead data file to convert the data file from Parquet format to CSV format, and AppFlow then inserts the new lead(s) back into Salesforce. + +### About Amazon AppFlow + +Amazon AppFlow is a fully managed integration service that enables users to securely transfer data between Software-as-a-Service (SaaS) applications like Salesforce, Marketo, Slack, and ServiceNow, and AWS services like Amazon S3 and Amazon Redshift. AppFlow automatically encrypts data in motion, and allows users to restrict data from flowing over the public internet for SaaS applications that are integrated with AWS PrivateLink, reducing exposure to security threats. + +As of today, Amazon AppFlow has 16 sources to choose from, and can send the data to four destinations. + +### About Teradata Vantage + +Teradata Vantage is the connected multi-cloud data platform for enterprise analytics, solving data challenges from start to scale. 
+ +Vantage enables companies to start small and elastically scale compute or storage, paying only for what they use, harnessing low-cost object stores and integrating their analytic workloads. Vantage supports R, Python, Teradata Studio, and any other SQL-based tools. + +Vantage combines descriptive, predictive, prescriptive analytics, autonomous decision-making, ML functions, and visualization tools into a unified, integrated platform that uncovers real-time business intelligence at scale, no matter where the data resides. + +Teradata Vantage Native Object Store (NOS) can be used to explore data in external object stores, like Amazon S3, using standard SQL. No special object storage-side compute infrastructure is required to use NOS. Users can explore data located in an Amazon S3 bucket by simply creating a NOS table definition that points to your bucket. With NOS, you can quickly import data from Amazon S3 or even join it with other tables in the Vantage database. + +### Prerequisites + +You are expected to be familiar with Amazon AppFlow service and Teradata Vantage. + +You will need the following accounts, and systems: + +import ClearscapeDocsNote from '../_partials/vantage_clearscape_analytics.mdx' + +* Access to a Teradata Vantage instance. + +* An AWS account with the role that can create and run flows. +* An Amazon S3 bucket to store Salesforce data (i.e., ptctsoutput) +* An Amazon S3 bucket to store raw Vantage data (Parquet file) (i.e., vantageparquet). This bucket needs to have policy to allow Amazon AppFlow access +* An Amazon S3 bucket to store converted Vantage data (CSV file) (i.e., vantagecsv) +* A Salesforce account that satisfies the following requirements: + * Your Salesforce account must be enabled for API access. API access is enabled by default for Enterprise, Unlimited, Developer, and Performance editions. + * Your Salesforce account must allow you to install connected apps. If this is disabled, contact your Salesforce administrator. After you create a Salesforce connection in Amazon AppFlow, verify that the connected app named "Amazon AppFlow Embedded Login App" is installed in your Salesforce account. + * The refresh token policy for the "Amazon AppFlow Embedded Login App" must be set to "Refresh token is valid until revoked". Otherwise, your flows will fail when your refresh token expires. + * You must enable Change Data Capture in Salesforce to use event-driven flow triggers. From Setup, enter "Change Data Capture" in Quick Find. + * If your Salesforce app enforces IP address restrictions, you must whitelist the addresses used by Amazon AppFlow. For more information, see [AWS IP address ranges](https://docs.aws.amazon.com/general/latest/gr/aws-ip-ranges.html) in the _Amazon Web Services General Reference_. + * If you are transferring over 1 million Salesforce records, you cannot choose any Salesforce compound field. Amazon AppFlow uses Salesforce Bulk APIs for the transfer, which does not allow transfer of compound fields. + * To create private connections using AWS PrivateLink, you must enable both "Manager Metadata" and "Manage External Connections" user permissions in your Salesforce account. Private connections are currently available in the us-east-1 and us-west-2 AWS Regions. + * Some Salesforce objects can't be updated, such as history objects. For these objects, Amazon AppFlow does not support incremental export (the "Transfer new data only" option) for schedule-triggered flows. 
Instead, you can choose the "Transfer all data" option and then select the appropriate filter to limit the records you transfer. + +### Procedure + +Once you have met the prerequisites, follow these steps: + + +1. Create a Salesforce to Amazon S3 Flow +2. Exploring Data using NOS +3. Export Vantage Data to Amazon S3 using NOS +4. Create an Amazon S3 to Salesforce Flow + +### Create a Salesforce to Amazon S3 Flow + +This step creates a flow using Amazon AppFlow. For this example, we're using a [Salesforce developer account](https://developer.salesforce.com/signup) to connect to Salesforce. + +Go to [AppFlow console](https://console.aws.amazon.com/appflow), sign in with your AWS login credentials and click *Create flow*. Make sure you are in the right region, and the bucket is created to store Salesforce data. + +![A screenshot of a social media post Description automatically generated](../cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image3.png) + +#### Step 1: Specify flow details + +This step provides basic information for your flow. + +Fill in **Flow name** (i.e. _salesforce_) and **Flow description (optional)**, leave **Customize encryption settings (advanced)** unchecked. Click **Next**. + +#### Step 2: Configure flow + +This step provides information about the source and destination for your flow. For this example, we will be using **_Salesforce_** as the source, and **_Amazon S3_** as the destination. + +* For *Source name*, choose _Salesforce_, then *_Create new connection_* for *Choose Salesforce connection*. +![A screenshot of a cell phone Description automatically generated](../cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image4.png) + +* Use default for **Salesforce environment** and **Data encryption**. Give your connection a name (i.e. _salesforce_) and click **Continue**. +![A screenshot of a cell phone Description automatically generated](../cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image5.png) + +* At the salesforce login window, enter your **Username** and **Password**. Click **Log In** +![A screenshot of a cell phone Description automatically generated](../cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image6.png) + +* Click **Allow** to allow AppFlow to access your salesforce data and information. +![A screenshot of a cell phone Description automatically generated](../cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image7.png) + +* Back at the AppFlow **Configure flow** window, use **Salesforce objects**, and choose _Account_ to be the Salesforce object. +![A screenshot of a cell phone Description automatically generated,](../cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image8.png) +![A screenshot of a cell phone Description automatically generated](../cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image9.png) + +* Use _Amazon S3_ as **Destination name**. Pick the bucket you created [earlier](#prerequisites) where you want the data to be stored (i.e., _ptctsoutput_). +![A screenshot of a cell phone Description automatically generated](../cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image10.png) + +* **Flow trigger** is _Run on demand_. Click **Next**. 
+
![A screenshot of a cell phone Description automatically generated](../cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image11.png)

#### Step 3: Map data fields

This step determines how data is transferred from the source to the destination.

* Use _Manually map fields_ as **Mapping method**
* For simplicity, choose _Map all fields directly_ for **Source to destination field mapping**.
![A screenshot of a cell phone Description automatically generated](../cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image12.png)

Once you click "_Map all fields directly_", all the fields appear under **Mapped fields**. Select the checkbox for the field(s) you want to **Add formula (concatenates)**, **Modify values (mask or truncate field values)**, or **Remove selected mappings**.

For this example, no checkboxes are selected.

* For **_Validations_**, add a condition to ignore records that contain no "_Billing Address_" (optional). Click **Next**.
![A screenshot of a cell phone Description automatically generated](../cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image13.png)

#### Step 4: Add filters

You can specify a filter to determine which records to transfer. For this example, add a condition to filter out the records that are deleted (optional). Click **Next**.

![A screenshot of a cell phone Description automatically generated](../cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image14.png)

#### Step 5: Review and create

Review all the information you just entered. Modify if necessary. Click **Create flow**.

Once the flow is created, a message confirming successful flow creation is displayed along with the flow information.

![A screenshot of a cell phone Description automatically generated](../cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image15.png)

#### Run flow

Click **Run flow** in the upper right corner.

Upon completion of the flow run, a message is displayed to indicate a successful run.

Message example:

![](../cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image16.png)

Click the link to the bucket to view the data. The Salesforce data is in JSON format.

#### Change data file properties

By default, Salesforce data is encrypted. We need to remove the encryption for NOS to access it.

Click the data file in your Amazon S3 bucket, then click the **Properties** tab.

![A screenshot of a social media post Description automatically generated](../cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image17.png)

Under **Encryption**, change the setting from _AWS-KMS_ to _None_. Click **Save**.

![A screenshot of a social media post Description automatically generated](../cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image18.png)

### Exploring Data Using NOS

Native Object Store has built-in functionality to explore and analyze data in Amazon S3. This section lists a few commonly used NOS functions.

#### Create Foreign Table

A foreign table allows external data to be easily referenced within the Vantage Advanced SQL Engine and makes the data available in a structured, relational format.

To create a foreign table, first log in to the Teradata Vantage system with your credentials.
Create an AUTHORIZATION object with access keys for Amazon S3 bucket access. The authorization object enhances security by controlling who is allowed to use a foreign table to access Amazon S3 data.

``` sql
CREATE AUTHORIZATION DefAuth_S3
AS DEFINER TRUSTED
USER 'A*****************' /* AccessKeyId */
PASSWORD '********'; /* SecretAccessKey */
```

"USER" is the AccessKeyId for your AWS account, and "PASSWORD" is the SecretAccessKey.

Create a foreign table against the JSON file on Amazon S3 using the following command.

``` sql
CREATE MULTISET FOREIGN TABLE salesforce,
EXTERNAL SECURITY DEFINER TRUSTED DefAuth_S3
(
  Location VARCHAR(2048) CHARACTER SET UNICODE CASESPECIFIC,
  Payload JSON(8388096) INLINE LENGTH 32000 CHARACTER SET UNICODE
)
USING
(
  LOCATION ('/S3/s3.amazonaws.com/ptctstoutput/salesforce/1ce190bc-25a9-4493-99ad-7497b497a0d0/903790813-2020-08-21T21:02:25')
);
```

At a minimum, the foreign table definition must include a table name and a LOCATION clause, which points to the object store data. The LOCATION requires a top-level single name, referred to as a "bucket" in Amazon.

If the file name doesn't end with a standard extension (.json, .csv, .parquet), the Location and Payload column definitions are also required to indicate the type of the data file.

Foreign tables are always defined as No Primary Index (NoPI) tables.

Once the foreign table is created, you can query the content of the Amazon S3 data set by running SELECT statements against the foreign table.

``` sql
SELECT * FROM salesforce;
SELECT payload.* FROM salesforce;
```

The foreign table contains only two columns: Location and Payload. Location is the address in the object store system. The data itself is represented in the payload column, with the payload value within each record in the foreign table representing a single JSON object and all its name-value pairs.

Sample output from "SELECT * FROM salesforce;".

![A picture containing monitor Description automatically generated](../cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image19.png)

Sample output from "SELECT payload.* FROM salesforce;".

![A screenshot of a cell phone Description automatically generated](../cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image20.png)

#### JSON_KEYS Table Operator

JSON data may contain different attributes in different records. To determine the full list of possible attributes in a data store, use JSON_KEYS:

``` sql
SELECT DISTINCT * FROM JSON_KEYS (ON (SELECT payload FROM salesforce)) AS j;
```

Partial Output:

![A screenshot of a cell phone Description automatically generated](../cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image21.png)

#### Create View

Views can simplify the names associated with the payload attributes, make it easier to code executable SQL against object store data, and hide the Location references in the foreign table so that the data looks like normal columns.

The following is a sample CREATE VIEW statement using the attributes discovered with the JSON_KEYS table operator above.
+ +``` sql +REPLACE VIEW salesforceView AS ( + SELECT + CAST(payload.Id AS VARCHAR(20)) Customer_ID, + CAST(payload."Name" AS VARCHAR(100)) Customer_Name, + CAST(payload.AccountNumber AS VARCHAR(10)) Acct_Number, + CAST(payload.BillingStreet AS VARCHAR(20)) Billing_Street, + CAST(payload.BillingCity AS VARCHAR(20)) Billing_City, + CAST(payload.BillingState AS VARCHAR(10)) Billing_State, + CAST(payload.BillingPostalCode AS VARCHAR(5)) Billing_Post_Code, + CAST(payload.BillingCountry AS VARCHAR(20)) Billing_Country, + CAST(payload.Phone AS VARCHAR(15)) Phone, + CAST(payload.Fax AS VARCHAR(15)) Fax, + CAST(payload.ShippingStreet AS VARCHAR(20)) Shipping_Street, + CAST(payload.ShippingCity AS VARCHAR(20)) Shipping_City, + CAST(payload.ShippingState AS VARCHAR(10)) Shipping_State, + CAST(payload.ShippingPostalCode AS VARCHAR(5)) Shipping_Post_Code, + CAST(payload.ShippingCountry AS VARCHAR(20)) Shipping_Country, + CAST(payload.Industry AS VARCHAR(50)) Industry, + CAST(payload.Description AS VARCHAR(200)) Description, + CAST(payload.NumberOfEmployees AS VARCHAR(10)) Num_Of_Employee, + CAST(payload.CustomerPriority__c AS VARCHAR(10)) Priority, + CAST(payload.Rating AS VARCHAR(10)) Rating, + CAST(payload.SLA__c AS VARCHAR(10)) SLA, + CAST(payload.AnnualRevenue AS VARCHAR(10)) Annual_Revenue, + CAST(payload."Type" AS VARCHAR(20)) Customer_Type, + CAST(payload.Website AS VARCHAR(100)) Customer_Website, + CAST(payload.LastActivityDate AS VARCHAR(50)) Last_Activity_Date + FROM salesforce +); +``` + +``` sql +SELECT * FROM salesforceView; +``` + +Partial output: + +![A picture containing computer Description automatically generated](../cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image22.png) + +#### READ_NOS Table Operator + +READ_NOS table operator can be used to sample and explore a percent of the data without having first defined a foreign table, or to view a list of the keys associated with all the objects specified by a Location clause. + +``` sql +SELECT top 5 payload.* +FROM READ_NOS ( + ON (SELECT CAST(NULL AS JSON CHARACTER SET Unicode)) +USING +LOCATION ('/S3/s3.amazonaws.com/ptctstoutput/salesforce/1ce190bc-25a9-4493-99ad-7497b497a0d0/903790813-2020-08-21T21:02:25') + ACCESS_ID ('A**********') /* AccessKeyId */ + ACCESS_KEY ('***********') /* SecretAccessKey */ + ) AS D +GROUP BY 1; +``` + +Output: + +![A screenshot of a cell phone Description automatically generate](../cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image23.png) + +#### Join Amazon S3 Data to In-Database Tables + +Foreign table can be joined with a table(s) in Vantage for further analysis. For example, ordering and shipping information are in Vantage in these three tables – Orders, Order_Items and Shipping_Address. 
+ +DDL for Orders: + +``` sql +CREATE TABLE Orders ( + Order_ID INT NOT NULL, + Customer_ID VARCHAR(20) CHARACTER SET LATIN CASESPECIFIC, + Order_Status INT, + -- Order status: 1 = Pending; 2 = Processing; 3 = Rejected; 4 = Completed + Order_Date DATE NOT NULL, + Required_Date DATE NOT NULL, + Shipped_Date DATE, + Store_ID INT NOT NULL, + Staff_ID INT NOT NULL +) Primary Index (Order_ID); +``` + +DDL for Order_Items: + +``` sql +CREATE TABLE Order_Items( + Order_ID INT NOT NULL, + Item_ID INT, + Product_ID INT NOT NULL, + Quantity INT NOT NULL, + List_Price DECIMAL (10, 2) NOT NULL, + Discount DECIMAL (4, 2) NOT NULL DEFAULT 0 +) Primary Index (Order_ID, Item_ID); +``` + +DDL for Shipping_Address: + +``` sql +CREATE TABLE Shipping_Address ( + Customer_ID VARCHAR(20) CHARACTER SET LATIN CASESPECIFIC NOT NULL, + Street VARCHAR(100) CHARACTER SET LATIN CASESPECIFIC, + City VARCHAR(20) CHARACTER SET LATIN CASESPECIFIC, + State VARCHAR(15) CHARACTER SET LATIN CASESPECIFIC, + Postal_Code VARCHAR(10) CHARACTER SET LATIN CASESPECIFIC, + Country VARCHAR(20) CHARACTER SET LATIN CASESPECIFIC +) Primary Index (Customer_ID); +``` + +And the tables have following data: + +Orders: + +![](../cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image24.png) + +Order_Items: + +![](../cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image25.png) + +Shipping_Address: + +![](../cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image26.png) + +By joining the salesforce foreign table to the established database table Orders, Order_Items and Shipping_Address, we can retrieve customer's order information with customer's shipping information. + +``` sql +SELECT + s.payload.Id as Customer_ID, + s.payload."Name" as Customer_Name, + s.payload.AccountNumber as Acct_Number, + o.Order_ID as Order_ID, + o.Order_Status as Order_Status, + o.Order_Date as Order_Date, + oi.Item_ID as Item_ID, + oi.Product_ID as Product_ID, + sa.Street as Shipping_Street, + sa.City as Shipping_City, + sa.State as Shipping_State, + sa.Postal_Code as Shipping_Postal_Code, + sa.Country as Shipping_Country +FROM + salesforce s, Orders o, Order_Items oi, Shipping_Address sa +WHERE + s.payload.Id = o.Customer_ID + AND o.Customer_ID = sa.Customer_ID + AND o.Order_ID = oi.Order_ID +ORDER BY 1; +``` + +Results: + +![](../cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image27.png) + +#### Import Amazon S3 Data to Vantage + +Having a persistent copy of the Amazon S3 data can be useful when repetitive access of the same data is expected. NOS foreign table does not automatically make a persistent copy of the Amazon S3 data. A few approaches to capture the data in the database are described below: + +A "CREATE TABLE AS … WITH DATA" statement can be used with the foreign table definition acting as the source table. Use this approach you can selectively choose which attributes within the foreign table payload that you want to include in the target table, and what the relational table columns will be named. 
+

``` sql
CREATE TABLE salesforceVantage AS (
    SELECT
        CAST(payload.Id AS VARCHAR(20)) Customer_ID,
        CAST(payload."Name" AS VARCHAR(100)) Customer_Name,
        CAST(payload.AccountNumber AS VARCHAR(10)) Acct_Number,
        CAST(payload.BillingStreet AS VARCHAR(20)) Billing_Street,
        CAST(payload.BillingCity AS VARCHAR(20)) Billing_City,
        CAST(payload.BillingState AS VARCHAR(10)) Billing_State,
        CAST(payload.BillingPostalCode AS VARCHAR(5)) Billing_Post_Code,
        CAST(payload.BillingCountry AS VARCHAR(20)) Billing_Country,
        CAST(payload.Phone AS VARCHAR(15)) Phone,
        CAST(payload.Fax AS VARCHAR(15)) Fax,
        CAST(payload.ShippingStreet AS VARCHAR(20)) Shipping_Street,
        CAST(payload.ShippingCity AS VARCHAR(20)) Shipping_City,
        CAST(payload.ShippingState AS VARCHAR(10)) Shipping_State,
        CAST(payload.ShippingPostalCode AS VARCHAR(5)) Shipping_Post_Code,
        CAST(payload.ShippingCountry AS VARCHAR(20)) Shipping_Country,
        CAST(payload.Industry AS VARCHAR(50)) Industry,
        CAST(payload.Description AS VARCHAR(200)) Description,
        CAST(payload.NumberOfEmployees AS INT) Num_Of_Employee,
        CAST(payload.CustomerPriority__c AS VARCHAR(10)) Priority,
        CAST(payload.Rating AS VARCHAR(10)) Rating,
        CAST(payload.SLA__c AS VARCHAR(10)) SLA,
        CAST(payload."Type" AS VARCHAR(20)) Customer_Type,
        CAST(payload.Website AS VARCHAR(100)) Customer_Website,
        CAST(payload.AnnualRevenue AS VARCHAR(10)) Annual_Revenue,
        CAST(payload.LastActivityDate AS DATE) Last_Activity_Date
    FROM salesforce)
WITH DATA
NO PRIMARY INDEX;
```

Partial results from `SELECT * FROM salesforceVantage;`:

![A screenshot of a computer Description automatically generated](../cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image28.png)

An alternative to using a foreign table is the READ_NOS table operator. This table operator allows you to access data directly from an object store without first building a foreign table. Combine READ_NOS with a CREATE TABLE AS clause to build a persistent version of the data in the database.
+ +``` sql +CREATE TABLE salesforceReadNOS AS ( + SELECT + CAST(payload.Id AS VARCHAR(20)) Customer_ID, + CAST(payload."Name" AS VARCHAR(100)) Customer_Name, + CAST(payload.AccountNumber AS VARCHAR(10)) Acct_Number, + CAST(payload.BillingStreet AS VARCHAR(20)) Billing_Street, + CAST(payload.BillingCity AS VARCHAR(20)) Billing_City, + CAST(payload.BillingState AS VARCHAR(10)) Billing_State, + CAST(payload.BillingPostalCode AS VARCHAR(5)) Billing_Post_Code, + CAST(payload.BillingCountry AS VARCHAR(20)) Billing_Country, + CAST(payload.Phone AS VARCHAR(15)) Phone, + CAST(payload.Fax AS VARCHAR(15)) Fax, + CAST(payload.ShippingStreet AS VARCHAR(20)) Shipping_Street, + CAST(payload.ShippingCity AS VARCHAR(20)) Shipping_City, + CAST(payload.ShippingState AS VARCHAR(10)) Shipping_State, + CAST(payload.ShippingPostalCode AS VARCHAR(5)) Shipping_Post_Code, + CAST(payload.ShippingCountry AS VARCHAR(20)) Shipping_Country, + CAST(payload.Industry AS VARCHAR(50)) Industry, + CAST(payload.Description AS VARCHAR(200)) Description, + CAST(payload.NumberOfEmployees AS INT) Num_Of_Employee, + CAST(payload.CustomerPriority__c AS VARCHAR(10)) Priority, + CAST(payload.Rating AS VARCHAR(10)) Rating, + CAST(payload.SLA__c AS VARCHAR(10)) SLA, + CAST(payload."Type" AS VARCHAR(20)) Customer_Type, + CAST(payload.Website AS VARCHAR(100)) Customer_Website, + CAST(payload.AnnualRevenue AS VARCHAR(10)) Annual_Revenue, + CAST(payload.LastActivityDate AS DATE) Last_Activity_Date + FROM READ_NOS ( + ON (SELECT CAST(NULL AS JSON CHARACTER SET Unicode)) + USING + LOCATION ('/S3/s3.amazonaws.com/ptctstoutput/salesforce/1ce190bc-25a9-4493-99ad-7497b497a0d0/903790813-2020-08-21T21:02:25') + ACCESS_ID ('A**********') /* AccessKeyId */ + ACCESS_KEY ('***********') /* SecretAccessKey */ + ) AS D +) WITH DATA; +``` + +Results from the `salesforceReadNOS` table: + +``` sql +SELECT * FROM salesforceReadNOS; +``` + +![A picture containing large, people, riding Description automatically generated](../cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image29.png) + +Another way of placing Amazon S3 data into a relational table is by "INSERT SELECT". Using this approach, the foreign table is the source table, while a newly created permanent table is the table to be inserted into. Contrary to the READ_NOS example above, this approach does require the permanent table be created beforehand. + +One advantage of the INSERT SELECT method is that you can change the target table's attributes. For example, you can specify that the target table be `MULTISET` or not, or you can choose a different primary index. 
+ +``` sql +CREATE TABLE salesforcePerm, FALLBACK , +NO BEFORE JOURNAL, +NO AFTER JOURNAL, +CHECKSUM = DEFAULT, +DEFAULT MERGEBLOCKRATIO, +MAP = TD_MAP1 +( + Customer_Id VARCHAR(20) CHARACTER SET LATIN NOT CASESPECIFIC, + Customer_Name VARCHAR(100) CHARACTER SET LATIN NOT CASESPECIFIC, + Acct_Number VARCHAR(10) CHARACTER SET LATIN NOT CASESPECIFIC, + Billing_Street VARCHAR(20) CHARACTER SET LATIN NOT CASESPECIFIC, + Billing_City VARCHAR(20) CHARACTER SET LATIN NOT CASESPECIFIC, + Billing_State VARCHAR(10) CHARACTER SET LATIN NOT CASESPECIFIC, + Billing_Post_Code VARCHAR(5) CHARACTER SET LATIN NOT CASESPECIFIC, + Billing_Country VARCHAR(20) CHARACTER SET LATIN NOT CASESPECIFIC, + Phone VARCHAR(15) CHARACTER SET LATIN NOT CASESPECIFIC, + Fax VARCHAR(15) CHARACTER SET LATIN NOT CASESPECIFIC, + Shipping_Street VARCHAR(20) CHARACTER SET LATIN NOT CASESPECIFIC, + Shipping_City VARCHAR(20) CHARACTER SET LATIN NOT CASESPECIFIC, + Shipping_State VARCHAR(10) CHARACTER SET LATIN NOT CASESPECIFIC, + Shipping_Post_Code VARCHAR(5) CHARACTER SET LATIN NOT CASESPECIFIC, + Shipping_Country VARCHAR(20) CHARACTER SET LATIN NOT CASESPECIFIC, + Industry VARCHAR(50) CHARACTER SET LATIN NOT CASESPECIFIC, + Description VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC, + Num_Of_Employee INT, + Priority VARCHAR(10) CHARACTER SET LATIN NOT CASESPECIFIC, + Rating VARCHAR(10) CHARACTER SET LATIN NOT CASESPECIFIC, + SLA VARCHAR(10) CHARACTER SET LATIN NOT CASESPECIFIC, + Customer_Type VARCHAR(20) CHARACTER SET LATIN NOT CASESPECIFIC, + Customer_Website VARCHAR(100) CHARACTER SET LATIN NOT CASESPECIFIC, + Annual_Revenue VARCHAR(10) CHARACTER SET LATIN NOT CASESPECIFIC, + Last_Activity_Date DATE +) PRIMARY INDEX (Customer_ID); +``` + +``` sql +INSERT INTO salesforcePerm + SELECT + CAST(payload.Id AS VARCHAR(20)) Customer_ID, + CAST(payload."Name" AS VARCHAR(100)) Customer_Name, + CAST(payload.AccountNumber AS VARCHAR(10)) Acct_Number, + CAST(payload.BillingStreet AS VARCHAR(20)) Billing_Street, + CAST(payload.BillingCity AS VARCHAR(20)) Billing_City, + CAST(payload.BillingState AS VARCHAR(10)) Billing_State, + CAST(payload.BillingPostalCode AS VARCHAR(5)) Billing_Post_Code, + CAST(payload.BillingCountry AS VARCHAR(20)) Billing_Country, + CAST(payload.Phone AS VARCHAR(15)) Phone, + CAST(payload.Fax AS VARCHAR(15)) Fax, + CAST(payload.ShippingStreet AS VARCHAR(20)) Shipping_Street, + CAST(payload.ShippingCity AS VARCHAR(20)) Shipping_City, + CAST(payload.ShippingState AS VARCHAR(10)) Shipping_State, + CAST(payload.ShippingPostalCode AS VARCHAR(5)) Shipping_Post_Code, + CAST(payload.ShippingCountry AS VARCHAR(20)) Shipping_Country, + CAST(payload.Industry AS VARCHAR(50)) Industry, + CAST(payload.Description AS VARCHAR(200)) Description, + CAST(payload.NumberOfEmployees AS INT) Num_Of_Employee, + CAST(payload.CustomerPriority__c AS VARCHAR(10)) Priority, + CAST(payload.Rating AS VARCHAR(10)) Rating, + CAST(payload.SLA__c AS VARCHAR(10)) SLA, + CAST(payload."Type" AS VARCHAR(20)) Customer_Type, + CAST(payload.Website AS VARCHAR(100)) Customer_Website, + CAST(payload.AnnualRevenue AS VARCHAR(10)) Annual_Revenue, + CAST(payload.LastActivityDate AS DATE) Last_Activity_Date + FROM salesforce; +``` + +``` sql +SELECT * FROM salesforcePerm; +``` + +Sample results: + +![A picture containing people Description automatically generated](../cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image30.png) + +### Export Vantage Data to Amazon S3 Using NOS + +I have a `newleads` table with 1 row in it on 
Vantage system. + +![](../cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image41.png) + +Note there's no address information for this lead. Let's use the account information retrieved from Salesforce to update `newleads` table + +``` sql +UPDATE nl +FROM + newleads AS nl, + salesforceReadNOS AS srn +SET + Street = srn.Billing_Street, + City = srn.Billing_City, + State = srn.Billing_State, + Post_Code = srn.Billing_Post_Code, + Country = srn.Billing_Country + WHERE Account_ID = srn.Acct_Number; +``` + +Now the new lead has address information. + +![](../cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image42.png) + +Write the new lead information into S3 bucket using WRITE_NOS. + +``` sql , id="salesforce_first_run", role="content-editable emits-gtm-events" +SELECT * FROM WRITE_NOS ( +ON ( + SELECT + Account_ID, + Last_Name, + First_Name, + Company, + Cust_Title, + Email, + Status, + Owner_ID, + Street, + City, + State, + Post_Code, + Country + FROM newleads +) +USING + LOCATION ('/s3/vantageparquet.s3.amazonaws.com/') + AUTHORIZATION ('{"Access_ID":"A*****","Access_Key":"*****"}') + COMPRESSION ('SNAPPY') + NAMING ('DISCRETE') + INCLUDE_ORDERING ('FALSE') + STOREDAS ('CSV') +) AS d; +``` + +Where Access_ID is the AccessKeyID, and Access_Key is the SecretAccessKey to the bucket. + +### Create an Amazon S3 to Salesforce Flow + +Repeat Step 1 to create a flow using Amazon S3 as source and Salesforce as destination. + +#### Step 1. Specify flow details + +This step provides basic information for your flow. + +Fill in *Flow name* (i.e., _vantage2sf_) and *Flow description (optional)*, leave *Customize encryption settings (advanced)* unchecked. Click *Next*. + +#### Step 2. Configure flow + +This step provides information about the source and destination for your flow. For this example, we will be using *_Amazon S3_* as the source, and *_Salesforce_* as the destination. + +* For *Source details*, choose _Amazon S3_, then choose the bucket where you wrote your CSV file to (i.e. vantagecsv) +* For *Destination details*, choose _Salesforce_, use the connection you created in Step 1 from the drop-down list for *Choose Salesforce connection*, and _Lead_ as *Choose Salesforce object*. +* For *Error handling*, use the default _Stop the current flow run_. +* *Flow trigger* is _Run on demand_. Click *Next*. + +#### Step 3. Map data fields + +This step determines how data is transferred from the source to the destination. + +* Use _Manually map fields_ as *Mapping method* +* Use _Insert new records (default)_ as *Destination record preference* +* For *Source to destination filed mapping*, use the following mapping +![Graphical user interface, application, table Description automatically generated](../cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image43.png) +![](../cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image44.png) + +* Click *Next*. + +#### Step 4. Add filters + +You can specify a filter to determine which records to transfer. For this example, no filter is added. Click *Next*. + +#### Step 5. Review and create + +Review all the information you just entered. Modify if necessary. Click *Create flow*. + +A message of successful flow creation will be displayed with the flow information once the flow is created, + +#### Run flow + +Click *Run flow* on the upper right corner. 
+ +Upon completion of the flow run, message will be displayed to indicate a successful run. + +Message example: + +![](../cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image45.png) + +Browse to the Salesforce page, new lead Tom Johnson has been added. + +![Graphical user interface, application Description automatically generated](../cloud-guides/images/integrate-teradata-vantage-to-salesforce-using-amazon-appflow/image46.png) + +### Cleanup (Optional) + +Once you are done with the Salesforce data, to avoid incurring charges to your AWS account (i.e., [AppFlow](https://aws.amazon.com/appflow/pricing/), Amazon [S3](https://aws.amazon.com/s3/pricing), [Vantage](https://www.teradata.com/Cloud/AWS/Do-it-Yourself/Pricing) and [VM](https://aws.amazon.com/ec2/pricing/)) for the resources used, follow these steps: + +1. AppFlow: + * Delete the "Connections" you created for the flow + * Delete the flows + +2. Amazon S3 bucket and file: + * Go to the Amazon S3 buckets where the Vantage data file is stored, and delete the file(s) + * If there are no need to keep the buckets, delete the buckets + +3. Teradata Vantage Instance + * Stop/Terminate the instance if no longer needed + +import CommunityLinkPartial from '../_partials/community_link.mdx'; + + diff --git a/quickstarts/manage-data/integrate-teradata-vantage-with-google-cloud-data-catalog.md b/quickstarts/manage-data/integrate-teradata-vantage-with-google-cloud-data-catalog.md new file mode 100644 index 0000000000..391c0ad929 --- /dev/null +++ b/quickstarts/manage-data/integrate-teradata-vantage-with-google-cloud-data-catalog.md @@ -0,0 +1,235 @@ +--- +sidebar_position: 17 +author: Wenjie Tehan +email: wenjie.tehan@teradata.com +page_last_update: February 14th, 2022 +description: Integrate Teradata Vantage with Google Cloud Data Catalog. +keywords: [data warehouses, compute storage separation, teradata, vantage, cloud data platform, object storage, business intelligence, enterprise analytics, google cloud, google cloud data catalog.] +--- + +# Integrate Teradata Vantage with Google Cloud Data Catalog + +### Overview + +This article describes the process to connect Teradata Vantage with Google Cloud Data Catalog using the [Data Catalog Teradata Connector on GitHub](https://github.com/GoogleCloudPlatform/datacatalog-connectors-rdbms/tree/master/google-datacatalog-teradata-connector), and then explore the metadata of the Vantage tables via Data Catalog. + +![Diagram Description automatically generated](../cloud-guides/images/integrate-teradata-vantage-with-google-cloud-data-catalog/image2.png) + +* Scrape: Connect to Teradata Vantage and retrieve all the available metadata +* Prepare: Transform metadata in Data Catalog entities and create Tags +* Ingest: Send the Data Catalog entities to the Google Cloud project + +### About Google Cloud Data Catalog + +Google Cloud Data Catalog is a fully managed data discovery and metadata management service. Data Catalog can catalog the native metadata on data assets. Data Catalog is serverless, and provides a central catalog to capture both technical metadata as well as business metadata in a structured format. + +### About Teradata Vantage + +Vantage is the modern cloud platform that unifies data warehouses, data lakes, and analytics into a single connected ecosystem. 
+

Vantage combines descriptive, predictive, prescriptive analytics, autonomous decision-making, ML functions, and visualization tools into a unified, integrated platform that uncovers real-time business intelligence at scale, no matter where the data resides.

Vantage enables companies to start small and elastically scale compute or storage, paying only for what they use, harnessing low-cost object stores and integrating their analytic workloads.

Vantage supports R, Python, Teradata Studio, and any other SQL-based tools. You can deploy Vantage across public clouds, on-premises, on optimized or commodity infrastructure, or as-a-service.

See the [documentation](https://docs.teradata.com/home) for more information on Teradata Vantage.

### Prerequisites

* Access to a Teradata Vantage instance.

* A **[Google Service Account](https://support.google.com/accounts/answer/27441?hl=en)** with the Data Catalog Admin role
* A [**Cloud Console Project created**](https://cloud.google.com/resource-manager/docs/creating-managing-projects) for your account (i.e. partner-integration-lab)
* Billing enabled
* Google Cloud SDK [installed](https://cloud.google.com/sdk/docs/install) and [initialized](https://cloud.google.com/sdk/docs/initializing)
* [Python](https://www.python.org/downloads/) installed
* [Pip](https://pip.pypa.io/en/stable/installation/) installed

### Procedure

1. Enable Data Catalog APIs
2. Install Teradata Data Catalog Connector
3. Run
4. Explore Teradata Vantage metadata with Data Catalog

### Enable Data Catalog API

* Log on to the [Google Cloud](http://console.cloud.google.com/) console, choose **APIs & Services** from the navigation menu, then click _Library_. Make sure your project is selected in the top menu bar.

  ![Graphical user interface](../cloud-guides/images/integrate-teradata-vantage-with-google-cloud-data-catalog/image3.png)
* Type _Data Catalog_ in the search box, click **Google Cloud Data Catalog API**, then click **ENABLE**.

  ![Graphical user interface](../cloud-guides/images/integrate-teradata-vantage-with-google-cloud-data-catalog/image4.png)

### Install Teradata Data Catalog Connector

A Teradata Data Catalog connector is available on [GitHub](https://github.com/GoogleCloudPlatform/datacatalog-connectors-rdbms/tree/master/google-datacatalog-teradata-connector). This connector is written in Python.

* Run the following command to authorize gcloud to access Google Cloud with your Google user credentials.

  ``` bash
  gcloud auth login
  ```
* Choose your Google account when the Google login page opens up and click _Allow_ on the next page.
* Next, set the default project if you haven't already done so:

  ``` bash
  gcloud config set project <project id>
  ```

### Install virtualenv

We recommend installing the Teradata Data Catalog Connector in an isolated Python environment.
To do so, install [virtualenv](https://virtualenv.pypa.io/en/latest/) first: + +* **Windows** + + Run in Powershell as Administrator: + ``` bash + pip install virtualenv + virtualenv --python python3.6 <your-env> + <your-env>\Scripts\activate + ``` +* **MacOS** + + ``` bash , role="content-editable emits-gtm-events" + pip install virtualenv + virtualenv --python python3.6 <your-env> + source <your-env>/bin/activate + ``` +* **Linux** + + ``` bash , role="content-editable emits-gtm-events" + pip install virtualenv + virtualenv --python python3.6 <your-env> + source <your-env>/bin/activate + ``` + +#### Install Data Catalog Teradata Connector + +* **Windows** + + ``` bash + pip.exe install google-datacatalog-teradata-connector + ``` +* **MacOS** + + ``` bash , role="content-editable emits-gtm-events" + pip install google-datacatalog-teradata-connector + ``` +* **Linux** + + ``` bash , role="content-editable emits-gtm-events" + pip install google-datacatalog-teradata-connector + ``` + +### Set environment variables + +``` bash , id="gcp_env_var", role="content-editable emits-gtm-events" +export GOOGLE_APPLICATION_CREDENTIALS=<google_credentials_file> +export TERADATA2DC_DATACATALOG_PROJECT_ID=<google_cloud_project_id> +export TERADATA2DC_DATACATALOG_LOCATION_ID=<google_cloud_location_id> +export TERADATA2DC_TERADATA_SERVER=<teradata_server> +export TERADATA2DC_TERADATA_USERNAME=<teradata_username> +export TERADATA2DC_TERADATA_PASSWORD=<teradata_password> +``` + +Where `` is the key for your service account (json file). + +### Run + +Execute `google-datacatalog-teradata-connector` command to establish entry point to Vantage database. + +``` bash , id="gcp_data_catalog_first_run" role="content-editable emits-gtm-events" +google-datacatalog-teradata-connector \ + --datacatalog-project-id=$TERADATA2DC_DATACATALOG_PROJECT_ID \ + --datacatalog-location-id=$TERADATA2DC_DATACATALOG_LOCATION_ID \ + --teradata-host=$TERADATA2DC_TERADATA_SERVER \ + --teradata-user=$TERADATA2DC_TERADATA_USERNAME \ + --teradata-pass=$TERADATA2DC_TERADATA_PASSWORD +``` + +Sample output from the google-datacatalog-teradata-connector command: + +``` +INFO:root: +==============Starting CLI=============== +INFO:root:This SQL connector does not implement the user defined datacatalog-entry-resource-url-prefix +INFO:root:This SQL connector uses the default entry resoure URL + +============Start teradata-to-datacatalog=========== + +==============Scrape metadata=============== +INFO:root:Scrapping metadata from connection_args + +1 table containers ready to be ingested... + +==============Prepare metadata=============== + +--> database: Gcpuser +37 tables ready to be ingested... + +==============Ingest metadata=============== + +DEBUG:google.auth._default:Checking /Users/Teradata/Apps/Cloud/GCP/teradata2dc-credentials.json for explicit credentials as part of auth process... +INFO:root:Starting to clean up the catalog... +DEBUG:google.auth.transport.requests:Making request: POST https://oauth2.googleapis.com/token +DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): oauth2.googleapis.com:443 +DEBUG:urllib3.connectionpool:https://oauth2.googleapis.com:443 "POST /token HTTP/1.1" 200 None +INFO:root:0 entries that match the search query exist in Data Catalog! +INFO:root:Looking for entries to be deleted... +INFO:root:0 entries will be deleted. + +Starting to ingest custom metadata... 
+ +DEBUG:google.auth._default:Checking /Users/Teradata/Apps/Cloud/GCP/teradata2dc-credentials.json for explicit credentials as part of auth process... +INFO:root:Starting the ingestion flow... +DEBUG:google.auth.transport.requests:Making request: POST https://oauth2.googleapis.com/token +DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): oauth2.googleapis.com:443 +DEBUG:urllib3.connectionpool:https://oauth2.googleapis.com:443 "POST /token HTTP/1.1" 200 None +INFO:root:Tag Template created: projects/partner-integration-lab/locations/us-west1/tagTemplates/teradata_database_metadata +INFO:root:Tag Template created: projects/partner-integration-lab/locations/us-west1/tagTemplates/teradata_table_metadata +INFO:root:Tag Template created: projects/partner-integration-lab/locations/us-west1/tagTemplates/teradata_column_metadata +INFO:root:Entry Group created: projects/partner-integration-lab/locations/us-west1/entryGroups/teradata +INFO:root:1/38 +INFO:root:Entry does not exist: projects/partner-integration-lab/locations/us-west1/entryGroups/teradata/entries/gcpuser +INFO:root:Entry created: projects/partner-integration-lab/locations/us-west1/entryGroups/teradata/entries/gcpuser +INFO:root: ^ [database] 34.105.107.155/gcpuser +INFO:root:Starting the upsert tags step +INFO:root:Processing Tag from Template: projects/partner-integration-lab/locations/us-west1/tagTemplates/teradata_database_metadata ... +INFO:root:Tag created: projects/partner-integration-lab/locations/us-west1/entryGroups/teradata/entries/gcpuser/tags/CWHNiGQeQmPT +INFO:root:2/38 +INFO:root:Entry does not exist: projects/partner-integration-lab/locations/us-west1/entryGroups/teradata/entries/gcpuser_Categories +INFO:root:Entry created: projects/partner-integration-lab/locations/us-west1/entryGroups/teradata/entries/gcpuser_Categories +INFO:root: ^ [table] 34.105.107.155/gcpuser/Categories +INFO:root:Starting the upsert tags step +INFO:root:Processing Tag from Template: projects/partner-integration-lab/locations/us-west1/tagTemplates/teradata_table_metadata ... +INFO:root:Tag created: projects/partner-integration-lab/locations/us-west1/entryGroups/teradata/entries/gcpuser_Categories/tags/Ceij5G9t915o +INFO:root:38/38 +INFO:root:Entry does not exist: projects/partner-integration-lab/locations/us-west1/entryGroups/teradata/entries/gcpuser_tablesv_instantiated_latest +INFO:root:Entry created: projects/partner-integration-lab/locations/us-west1/entryGroups/teradata/entries/gcpuser_tablesv_instantiated_latest +INFO:root: ^ [table] 34.105.107.155/gcpuser/tablesv_instantiated_latest +INFO:root:Starting the upsert tags step +INFO:root:Processing Tag from Template: projects/partner-integration-lab/locations/us-west1/tagTemplates/teradata_table_metadata ... +INFO:root:Tag created: projects/partner-integration-lab/locations/us-west1/entryGroups/teradata/entries/gcpuser_tablesv_instantiated_latest/tags/Ceij5G9t915o +INFO:root: +============End teradata-to-datacatalog============ +``` + +### Explore Teradata Vantage metadata with Data Catalog + +* Go to [Data Catalog](https://console.cloud.google.com/datacatalog) console, click on the project (i.e. partner-integration-lab) under **Projects**. The Teradata tables are showing on the right panel. + + ![Graphical user interface](../cloud-guides/images/integrate-teradata-vantage-with-google-cloud-data-catalog/image5.png) +* Click on the table to your interest (i.e. 
CITY_LEVEL_TRANS), and you’ll see the metadata about this table: + + ![Graphical user interface](../cloud-guides/images/integrate-teradata-vantage-with-google-cloud-data-catalog/image6.png) + +### Cleanup (optional) + +* Clean up metadata from Data Catalog. To do that, copy https://github.com/GoogleCloudPlatform/datacatalog-connectors-rdbms/blob/master/google-datacatalog-teradata-connector/tools/cleanup_datacatalog.py to local directory. +* Change directory to where the file is and then run following command: + + ``` bash , role="content-editable emits-gtm-events" + python cleanup_datacatalog.py --datacatalog-project-ids=$TERADATA2DC_DATACATALOG_PROJECT_ID + ``` diff --git a/quickstarts/manage-data/nos.md b/quickstarts/manage-data/nos.md new file mode 100644 index 0000000000..14d28c627c --- /dev/null +++ b/quickstarts/manage-data/nos.md @@ -0,0 +1,250 @@ +--- +id: nos +sidebar_position: 1 +author: Adam Tworkiewicz +email: adam.tworkiewicz@teradata.com +page_last_update: September 7th, 2021 +description: Teradata Vantage Native Object Storage - read and write from/to object storage, unified SQL interface for Vantage and object storage. +keywords: [data warehouses, compute storage separation, teradata, vantage, cloud data platform, object storage, business intelligence, enterprise analytics] +--- + +import ClearscapeDocsNote from '../_partials/vantage_clearscape_analytics.mdx' +import CommunityLink from '../_partials/community_link.mdx' + +# Query data stored in object storage + +## Overview + +Native Object Storage (NOS) is a Vantage feature that allows you to query data stored in files in object storage such as AWS S3, Google GCS, Azure Blob or on-prem implementations. It's useful in scenarios where you want to explore data without building a data pipeline to bring it into Vantage. + +## Prerequisites + +You need access to a Teradata Vantage instance. NOS is enabled in all Vantage editions from Vantage Express through Developer, DYI to Vantage as a Service starting from version 17.10. + + + +## Explore data with NOS + +:::note +Currently, NOS supports CSV, JSON (as array or new-line delimited), and Parquet data formats. +::: + +Let's say you have a dataset stored as CSV files in an S3 bucket. You want to explore the dataset before you decide if you want to bring it into Vantage. For this scenario, we are going to use a public dataset published by Teradata that contains river flow data collected by the +U.S. Geological Survey. The bucket is at https://td-usgs-public.s3.amazonaws.com/. + +Let's first have a look at sample CSV data. We take the first 10 rows that Vantage will fetch from the bucket: + +``` +SELECT + TOP 10 * +FROM ( + LOCATION='/s3/td-usgs-public.s3.amazonaws.com/CSVDATA/' +) AS d; +``` + +Here is what I've got: + +``` +GageHeight2 Flow site_no datetime Precipitation GageHeight +----------- ----- -------- ---------------- ------------- ----------- +10.9 15300 09380000 2018-06-28 00:30 671 9.80 +10.8 14500 09380000 2018-06-28 01:00 673 9.64 +10.7 14100 09380000 2018-06-28 01:15 672 9.56 +11.0 16200 09380000 2018-06-27 00:00 669 9.97 +10.9 15700 09380000 2018-06-27 00:30 668 9.88 +10.8 15400 09380000 2018-06-27 00:45 672 9.82 +10.8 15100 09380000 2018-06-27 01:00 672 9.77 +10.8 14700 09380000 2018-06-27 01:15 672 9.68 +10.9 16000 09380000 2018-06-27 00:15 668 9.93 +10.8 14900 09380000 2018-06-28 00:45 672 9.72 +``` + +We have got plenty of numbers, but what do they mean? 
To answer this question, we will ask Vantage to detect the schema of the CSV files: + +``` +SELECT + * +FROM ( + LOCATION='/s3/td-usgs-public.s3.amazonaws.com/CSVDATA/' + RETURNTYPE='NOSREAD_SCHEMA' +) AS d; +``` + +Vantage will now fetch a data sample to analyze the schema and return results: + +``` +Name Datatype FileType Location +--------------- ----------------------------------- --------- ------------------------------------------------------------------- +GageHeight2 decimal(3,2) csv /S3/s3.amazonaws.com/td-usgs-public/CSVDATA/09513780/2018/06/27.csv +Flow decimal(3,2) csv /S3/s3.amazonaws.com/td-usgs-public/CSVDATA/09513780/2018/06/27.csv +site_no int csv /S3/s3.amazonaws.com/td-usgs-public/CSVDATA/09513780/2018/06/27.csv +datetime TIMESTAMP(0) FORMAT'Y4-MM-DDBHH:MI' csv /S3/s3.amazonaws.com/td-usgs-public/CSVDATA/09513780/2018/06/27.csv +Precipitation decimal(3,2) csv /S3/s3.amazonaws.com/td-usgs-public/CSVDATA/09513780/2018/06/27.csv +GageHeight decimal(3,2) csv /S3/s3.amazonaws.com/td-usgs-public/CSVDATA/09513780/2018/06/27.csv +``` + + + +We see that the CSV files have 6 columns. For each column, we get the name, the datatype and the file coordinates that were used to infer the schema. + +## Query data with NOS + +Now that we know the schema, we can work with the dataset as if it was a regular SQL table. To prove the point, let's try to do some data aggregation. Let's get an average temperature per site for sites that collect temperatures. + +``` +SELECT + site_no Site_no, AVG(Flow) Avg_Flow +FROM ( + LOCATION='/s3/td-usgs-public.s3.amazonaws.com/CSVDATA/' +) AS d +GROUP BY + site_no +HAVING + Avg_Flow IS NOT NULL; +``` + +Result: + +``` +Site_no Avg_Flow +-------- --------- +09380000 11 +09423560 73 +09424900 93 +09429070 81 +``` + +To register your ad hoc exploratory activity as a permanent source, create it as a foreign table: + +``` +-- If you are running this sample as dbc user you will not have permissions +-- to create a table in dbc database. Instead, create a new database and use +-- the newly create database to create a foreign table. + +CREATE DATABASE Riverflow + AS PERMANENT = 60e6, -- 60MB + SPOOL = 120e6; -- 120MB + +-- change current database to Riverflow +DATABASE Riverflow; + +CREATE FOREIGN TABLE riverflow + USING ( LOCATION('/s3/td-usgs-public.s3.amazonaws.com/CSVDATA/') ); + +SELECT top 10 * FROM riverflow; +``` + +Result: + +``` +Location GageHeight2 Flow site_no datetime Precipitation GageHeight +------------------------------------------------------------------- ----------- ---- ------- ------------------- ------------- ---------- +/S3/s3.amazonaws.com/td-usgs-public/CSVDATA/09429070/2018/07/02.csv null null 9429070 2018-07-02 14:40:00 1.21 null +/S3/s3.amazonaws.com/td-usgs-public/CSVDATA/09400815/2018/07/10.csv null 0.00 9400815 2018-07-10 00:30:00 0.00 -0.01 +/S3/s3.amazonaws.com/td-usgs-public/CSVDATA/09400815/2018/07/10.csv null 0.00 9400815 2018-07-10 00:45:00 0.00 -0.01 +/S3/s3.amazonaws.com/td-usgs-public/CSVDATA/09400815/2018/07/10.csv null 0.00 9400815 2018-07-10 01:00:00 0.00 -0.01 +/S3/s3.amazonaws.com/td-usgs-public/CSVDATA/09400815/2018/07/10.csv null 0.00 9400815 2018-07-10 00:15:00 0.00 -0.01 +/S3/s3.amazonaws.com/td-usgs-public/CSVDATA/09429070/2018/07/02.csv null null 9429070 2018-07-02 14:38:00 1.06 null +``` + + +This time, the `SELECT` statement looks like a regular select against an in-database table. 
If you require subsecond response time when querying the data, there is an easy way to bring the CSV data into Vantage to speed things up. Read on to find out how. + +## Load data from NOS into Vantage + +Querying object storage takes time. What if you decided that the data looks interesting and you want to do some more analysis with a solution that will you quicker answers? The good news is that data returned with NOS can be used as a source for `CREATE TABLE` statements. Assuming you have `CREATE TABLE` privilege, you will be able to run: + +IMPORTANT: This query assumes you created database `Riverflow` and a foreign table called `riverflow` in the previous step. + +``` + +-- This query assumes you created database `Riverflow` +-- and a foreign table called `riverflow` in the previous step. + +CREATE MULTISET TABLE riverflow_native (site_no, Flow, GageHeight, datetime) +AS ( + SELECT site_no, Flow, GageHeight, datetime FROM riverflow +) WITH DATA +NO PRIMARY INDEX; + +SELECT TOP 10 * FROM riverflow_native; +``` + +Result: + +``` +site_no Flow GageHeight datetime +------- ----- ---------- ------------------- +9400815 .00 -.01 2018-07-10 00:30:00 +9400815 .00 -.01 2018-07-10 01:00:00 +9400815 .00 -.01 2018-07-10 01:15:00 +9400815 .00 -.01 2018-07-10 01:30:00 +9400815 .00 -.01 2018-07-10 02:00:00 +9400815 .00 -.01 2018-07-10 02:15:00 +9400815 .00 -.01 2018-07-10 01:45:00 +9400815 .00 -.01 2018-07-10 00:45:00 +9400815 .00 -.01 2018-07-10 00:15:00 +9400815 .00 -.01 2018-07-10 00:00:00 +``` + +This time, the `SELECT` query returned in less than a second. Vantage didn't have to fetch the data from NOS. Instead, it answered using data that was already on its nodes. + +## Access private buckets + +So far, we have used a public bucket. What if you have a private bucket? How do you tell Vantage what credentials it should use? + +It is possible to inline your credentials directly into your query: + +``` +SELECT + TOP 10 * +FROM ( + LOCATION='/s3/td-usgs-public.s3.amazonaws.com/CSVDATA/' + AUTHORIZATION='{"ACCESS_ID":"","ACCESS_KEY":""}' +) AS d; +``` + +Entering these credentials all the time can be tedious and less secure. In Vantage, you can create an authorization object that will serve as a container for your credentials: + +``` +CREATE AUTHORIZATION aws_authorization + USER 'YOUR-ACCESS-KEY-ID' + PASSWORD 'YOUR-SECRET-ACCESS-KEY'; +``` + +You can then reference your authorization object when you create a foreign table: + +``` +CREATE FOREIGN TABLE riverflow +, EXTERNAL SECURITY aws_authorization +USING ( LOCATION('/s3/td-usgs-public.s3.amazonaws.com/CSVDATA/') ); +``` + +## Export data from Vantage to object storage + +So far, we have talked about reading and importing data from object storage. Wouldn't it be nice if we had a way to use SQL to export data from Vantage to object storage? This is exactly what `WRITE_NOS` function is for. Let's say we want to export data from `riverflow_native` table to object storage. You can do so with the following query: + +``` +SELECT * FROM WRITE_NOS ( + ON ( SELECT * FROM riverflow_native ) + PARTITION BY site_no ORDER BY site_no + USING + LOCATION('YOUR-OBJECT-STORE-URI') + AUTHORIZATION(aws_authorization) + STOREDAS('PARQUET') + COMPRESSION('SNAPPY') + NAMING('RANGE') + INCLUDE_ORDERING('TRUE') +) AS d; +``` + +Here, we instruct Vantage to take data from `riverflow_native` and save it in `YOUR-OBJECT-STORE-URI` bucket using `parquet` format. The data will be split into files by `site_no` attribute. The files will be compressed. 
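
If you want to confirm what the export produced, one option is to list the objects that `WRITE_NOS` just created. The query below is a quick sketch, not part of the original walkthrough: it reuses the ad hoc NOS syntax shown earlier with the `NOSREAD_KEYS` return type, and it assumes the same `YOUR-OBJECT-STORE-URI` location and bucket credentials you used for the export. If your Vantage version doesn't accept `RETURNTYPE` in this shorthand form, the `READ_NOS` table operator offers the same option.

```
-- List the objects written by WRITE_NOS.
-- Assumes the same location and credentials as the export above.
SELECT TOP 10 *
FROM (
  LOCATION='YOUR-OBJECT-STORE-URI'
  AUTHORIZATION='{"ACCESS_ID":"YOUR-ACCESS-KEY-ID","ACCESS_KEY":"YOUR-SECRET-ACCESS-KEY"}'
  RETURNTYPE='NOSREAD_KEYS'
) AS d;
```

Each returned row describes one object in the bucket, so you can check that the per-`site_no` split produced the files you expect before handing the location to downstream consumers.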
+ +## Summary + +In this quick start we have learned how to read data from object storage using Native Object Storage (NOS) functionality in Vantage. NOS supports reading and importing data stored in CSV, JSON and Parquet formats. NOS can also export data from Vantage to object storage. + +# Further reading +* [Teradata Vantage™ - Native Object Store Getting Started Guide](https://docs.teradata.com/r/2mw8ooFr~xX0EaaGFaDW8A/root) + + \ No newline at end of file diff --git a/quickstarts/manage-data/run-bulkloads-efficiently-with-teradata-parallel-transporter.md b/quickstarts/manage-data/run-bulkloads-efficiently-with-teradata-parallel-transporter.md new file mode 100644 index 0000000000..45888fa99d --- /dev/null +++ b/quickstarts/manage-data/run-bulkloads-efficiently-with-teradata-parallel-transporter.md @@ -0,0 +1,256 @@ +--- +sidebar_position: 11 +author: Adam Tworkiewicz +email: adam.tworkiewicz@teradata.com +page_last_update: April 6th, 2022 +description: Load data into Vantage efficiently using Teradata Parallel Transporter (TPT). +keywords: [data warehouses, compute storage separation, teradata, vantage, cloud data platform, object storage, business intelligence, enterprise analytics, Fastload, Teradata Parallel Transporter, TPT] +id: run-bulkloads-efficiently-with-teradata-parallel-transporter +--- + +import ClearscapeDocsNote from '../_partials/vantage_clearscape_analytics.mdx'; +import Tabs from '../_partials/tabsTPT.mdx'; + +# Run large bulkloads efficiently with Teradata Parallel Transporter (TPT) + +## Overview + +We often have a need to move large volumes of data into Vantage. Teradata offers `Teradata Parallel Transporter (TPT)` utility that can efficiently load large amounts of data into Teradata Vantage. This how-to demonstrates how to use `TPT`. In this scenario, we will load over 300k records, over 40MB of data, in a couple of seconds. + +## Prerequisites + +* Access to a Teradata Vantage instance. + + +* Download Teradata Tools and Utilities (TTU) - supported platforms: [Windows](https://downloads.teradata.com/download/tools/teradata-tools-and-utilities-windows-installation-package), [MacOS](https://downloads.teradata.com/download/tools/teradata-tools-and-utilities-mac-osx-installation-package), [Linux](https://downloads.teradata.com/download/tools/teradata-tools-and-utilities-linux-installation-package-0) (requires registration). + +## Install TTU + + + + +## Get Sample data + +We will be working with the US tax fillings for nonprofit organizations. Nonprofit tax filings are public data. The US Internal Revenue Service publishes them in S3 bucket. Let's grab a summary of filings for 2020: `https://storage.googleapis.com/clearscape_analytics_demo_data/TPT/index_2020.csv`. You can use your browser, `wget` or `curl` to save the file locally. + +## Create a database + +Let's create a database in Vantage. Use your favorite SQL tool to run the following query: + +``` sql +CREATE DATABASE irs +AS PERMANENT = 120e6, -- 120MB + SPOOL = 120e6; -- 120MB +``` + +## Run TPT + +We will now run `TPT`. `TPT` is a command-line tool that can be used to load, extract and update data in Teradata Vantage. These various functions are implemented in so called `operators`. For example, loading data into Vantage is handled by the `Load` operator. The `Load` operator is very efficient in uploading large amounts of data into Vantage. The `Load` operator, in order to be fast, has several restrictions in place. It can only populate empty tables. Inserts to already populated tables are not supported. 
It doesn't support tables with secondary indices. Also, it won't insert duplicate records, even if a table is a `MULTISET` table. For the full list of restrictions check out [Teradata® TPT Reference - Load Operator - Restrictions and Limitations](https://docs.teradata.com/r/Teradata-Parallel-Transporter-Reference/February-2022/Load-Operator/Usage-Notes/Normalized-Tables/Restrictions-and-Limitations). + +TPT has its own scripting language. The language allows you to prepare the database with arbitrary SQL commands, declare the input source and define how the data should be inserted into Vantage. + +To load the csv data to Vantage, we will define and run a job. The job will prepare the database. It will remove old log and error tables and create the target table. It will then read the file and insert the data into the database. + +* Create a job variable file that will tell TPT how to connect to our Vantage database. Create file `jobvars.txt` and insert the following content. Replace `host` with the host name of your database. For example, if you are using a local Vantage Express instance, use `127.0.0.1`. `username` with the database user name, and `password` with the database password. Note that the preparation step (DDL) and the load step have their own configuration values and that the config values need to be entered twice to configure both the DDL and the load step. + +``` bash , id="tpt_first_config", role="emits-gtm-events" +TargetTdpId = 'host' +TargetUserName = 'username' +TargetUserPassword = 'password' + +FileReaderDirectoryPath = '' +FileReaderFileName = 'index_2020.csv' +FileReaderFormat = 'Delimited' +FileReaderOpenMode = 'Read' +FileReaderTextDelimiter = ',' +FileReaderSkipRows = 1 + +DDLErrorList = '3807' + +LoadLogTable = 'irs.irs_returns_lg' +LoadErrorTable1 = 'irs.irs_returns_et' +LoadErrorTable2 = 'irs.irs_returns_uv' +LoadTargetTable = 'irs.irs_returns' +``` + +* Create a file with the following content and save it as `load.txt`. See comments within the job file to understand its structure. + +``` bash +DEFINE JOB file_load +DESCRIPTION 'Load a Teradata table from a file' +( + /* + Define the schema of the data in the csv file + */ + DEFINE SCHEMA SCHEMA_IRS + ( + in_return_id VARCHAR(19), + in_filing_type VARCHAR(5), + in_ein VARCHAR(19), + in_tax_period VARCHAR(19), + in_sub_date VARCHAR(22), + in_taxpayer_name VARCHAR(100), + in_return_type VARCHAR(5), + in_dln VARCHAR(19), + in_object_id VARCHAR(19) + ); + + /* + In the first step, we are sending statements to remove old tables + and create a new one. + This step replies on configuration stored in `od_IRS` operator + */ + STEP st_Setup_Tables + ( + APPLY + ('DROP TABLE ' || @LoadLogTable || ';'), + ('DROP TABLE ' || @LoadErrorTable1 || ';'), + ('DROP TABLE ' || @LoadErrorTable2 || ';'), + ('DROP TABLE ' || @LoadTargetTable || ';'), + ('CREATE TABLE ' || @LoadTargetTable || ' ( + return_id INT, + filing_type VARCHAR(5) CHARACTER SET LATIN NOT CASESPECIFIC, + ein INT, + tax_period INT, + sub_date VARCHAR(100) CHARACTER SET LATIN NOT CASESPECIFIC, + taxpayer_name VARCHAR(100) CHARACTER SET LATIN NOT CASESPECIFIC, + return_type VARCHAR(5) CHARACTER SET LATIN NOT CASESPECIFIC, + dln BIGINT, + object_id BIGINT + ) + PRIMARY INDEX ( return_id );') + TO OPERATOR ($DDL); + ); + + /* + Finally, in this step we read the data from the file operator + and send it to the load operator. 
+ */ + STEP st_Load_File + ( + APPLY + ('INSERT INTO ' || @LoadTargetTable || ' ( + return_id, + filing_type, + ein, + tax_period, + sub_date, + taxpayer_name, + return_type, + dln, + object_id + ) VALUES ( + :in_return_id, + :in_filing_type, + :in_ein, + :in_tax_period, + :in_sub_date, + :in_taxpayer_name, + :in_return_type, + :in_dln, + :in_object_id + );') + TO OPERATOR ($LOAD) + SELECT * FROM OPERATOR($FILE_READER(SCHEMA_IRS)); + ); +); +``` + +* Run the job: + +``` bash +tbuild -f load.txt -v jobvars.txt -j file_load +``` + +A successful run will return logs that look like this: + +``` bash +Teradata Parallel Transporter Version 17.10.00.10 64-Bit +The global configuration file '/opt/teradata/client/17.10/tbuild/twbcfg.ini' is used. + Log Directory: /opt/teradata/client/17.10/tbuild/logs + Checkpoint Directory: /opt/teradata/client/17.10/tbuild/checkpoint + +Job log: /opt/teradata/client/17.10/tbuild/logs/file_load-4.out +Job id is file_load-4, running on osboxes +Teradata Parallel Transporter SQL DDL Operator Version 17.10.00.10 +od_IRS: private log not specified +od_IRS: connecting sessions +od_IRS: sending SQL requests +od_IRS: TPT10508: RDBMS error 3807: Object 'irs_returns_lg' does not exist. +od_IRS: TPT18046: Error is ignored as requested in ErrorList +od_IRS: TPT10508: RDBMS error 3807: Object 'irs_returns_et' does not exist. +od_IRS: TPT18046: Error is ignored as requested in ErrorList +od_IRS: TPT10508: RDBMS error 3807: Object 'irs_returns_uv' does not exist. +od_IRS: TPT18046: Error is ignored as requested in ErrorList +od_IRS: disconnecting sessions +od_IRS: Total processor time used = '0.013471 Second(s)' +od_IRS: Start : Thu Apr 7 20:56:32 2022 +od_IRS: End : Thu Apr 7 20:56:32 2022 +Job step st_Setup_Tables completed successfully +Teradata Parallel Transporter Load Operator Version 17.10.00.10 +ol_IRS: private log not specified +Teradata Parallel Transporter DataConnector Operator Version 17.10.00.10 +op_IRS[1]: Instance 1 directing private log report to 'dtacop-root-368731-1'. +op_IRS[1]: DataConnector Producer operator Instances: 1 +op_IRS[1]: ECI operator ID: 'op_IRS-368731' +op_IRS[1]: Operator instance 1 processing file 'index_2020.csv'. +ol_IRS: connecting sessions +ol_IRS: preparing target table +ol_IRS: entering Acquisition Phase +ol_IRS: entering Application Phase +ol_IRS: Statistics for Target Table: 'irs.irs_returns' +ol_IRS: Total Rows Sent To RDBMS: 333722 +ol_IRS: Total Rows Applied: 333722 +ol_IRS: Total Rows in Error Table 1: 0 +ol_IRS: Total Rows in Error Table 2: 0 +ol_IRS: Total Duplicate Rows: 0 +op_IRS[1]: Total files processed: 1. +ol_IRS: disconnecting sessions +Job step st_Load_File completed successfully +Job file_load completed successfully +ol_IRS: Performance metrics: +ol_IRS: MB/sec in Acquisition phase: 9.225 +ol_IRS: Elapsed time from start to Acquisition phase: 2 second(s) +ol_IRS: Elapsed time in Acquisition phase: 5 second(s) +ol_IRS: Elapsed time in Application phase: 3 second(s) +ol_IRS: Elapsed time from Application phase to end: < 1 second +ol_IRS: Total processor time used = '0.254337 Second(s)' +ol_IRS: Start : Thu Apr 7 20:56:32 2022 +ol_IRS: End : Thu Apr 7 20:56:42 2022 +Job start: Thu Apr 7 20:56:32 2022 +Job end: Thu Apr 7 20:56:42 2022 +``` + + +## `TPT` vs. NOS + +In our case, the file is in an S3 bucket. 
That means, that we can use Native Object Storage (NOS) to ingest the data: + +``` sql +-- create an S3-backed foreign table +CREATE FOREIGN TABLE irs_returns_nos + USING ( LOCATION('/s3/s3.amazonaws.com/irs-form-990/index_2020.csv') ); + +-- load the data into a native table +CREATE MULTISET TABLE irs_returns_nos_native + (RETURN_ID, FILING_TYPE, EIN, TAX_PERIOD, SUB_DATE, TAXPAYER_NAME) +AS ( + SELECT RETURN_ID, FILING_TYPE, EIN, TAX_PERIOD, SUB_DATE, TAXPAYER_NAME FROM irs_returns_nos +) WITH DATA +NO PRIMARY INDEX; +``` + +The NOS solution is convenient as it doesn't depend on additional tools. It can be implemented using only SQL. It performs well, especially for Vantage deployments with a high number of AMPs as NOS tasks are delegated to AMPs and run in parallel. Also, splitting the data in object storage into multiple files may further improve performance. + +## Summary + +This how-to demonstrated how to ingest large amounts of data into Vantage. We loaded hundreds of thousands or records into Vantage in a couple of seconds using `TPT`. + +## Further reading +* [Teradata® TPT User Guide](https://docs.teradata.com/r/Teradata-Parallel-Transporter-User-Guide/February-2022) +* [Teradata® TPT Reference](https://docs.teradata.com/r/Teradata-Parallel-Transporter-Reference/February-2022) + + +import CommunityLinkPartial from '../_partials/community_link.mdx'; + + diff --git a/quickstarts/manage-data/segment.md b/quickstarts/manage-data/segment.md new file mode 100644 index 0000000000..1922871bdf --- /dev/null +++ b/quickstarts/manage-data/segment.md @@ -0,0 +1,153 @@ +--- +sidebar_position: 15 +id: segment +author: Adam Tworkiewicz +email: adam.tworkiewicz@teradata.com +page_last_update: January 18th, 2022 +description: Store events from Twilio Segment in Teradata Vantage. +keywords: [data warehouses, compute storage separation, teradata, vantage, cloud data platform, object storage, business intelligence, enterprise analytics, customer data platform, cdp, segment] +--- + +import ClearscapeDocsNote from '../_partials/vantage_clearscape_analytics.mdx' +import CommunityLink from '../_partials/community_link.mdx' +import tabsDBT from '../_partials/tabsDBT.mdx' + +# Store events from Twilio Segment + +## Overview +This solution listens to events from Twilio Segment and writes data to a Teradata Vantage instance. The example uses Google Cloud but it can be translated into any cloud platform. + +## Architecture + +In this solution, Twilio Segment writes raw event data to Google Cloud Pub/Sub. Pub/Sub forwards events to a Cloud Run application. The Cloud Run app writes data to a Teradata Vantage database. It's a serverless solution that doesn't require allocation or management of any VM's. + +![Segment Google Cloud Flow Diagram](../images/segment.flow.diagram.png) + +## Deployment + +### Prerequisites +1. A Google Cloud account. If you don't have an account, you can create one at https://console.cloud.google.com/. +2. `gcloud` installed. See https://cloud.google.com/sdk/docs/install. +3. A Teradata Vantage instance that Google Cloud Run can talk to. + + + +### Build and deploy + +1. Clone the sample repository: +``` +git clone git@github.com:Teradata/segment-integration-tutorial.git +``` + +2. The repo contains `segment.sql` file that sets up the database. 
Run the script on your Vantage database using your favorite SQL IDE, [Teradata Studio](https://downloads.teradata.com/download/tools/teradata-studio), or the `bteq` command line tool (download for [Windows](https://downloads.teradata.com/node/7314), [Linux](https://downloads.teradata.com/node/200442), [macOS](https://downloads.teradata.com/node/201214)).
+The SQL script will create a new database called `Segment` and a set of tables to store Segment events.
+
+3. Set the default project and region:
+```
+gcloud config set project
+gcloud config set compute/region
+```
+
+4. Retrieve the project ID and project number. We will need them in subsequent steps:
+```
+export PROJECT_ID=$(gcloud config get-value project)
+
+export PROJECT_NUMBER=$(gcloud projects list \
+  --filter="$(gcloud config get-value project)" \
+  --format="value(PROJECT_NUMBER)")
+```
+
+5. Enable required Google Cloud services:
+```
+gcloud services enable cloudbuild.googleapis.com containerregistry.googleapis.com run.googleapis.com secretmanager.googleapis.com pubsub.googleapis.com
+```
+
+6. Build the application:
+```
+gcloud builds submit --tag gcr.io/$PROJECT_ID/segment-listener
+```
+
+7. Store the Vantage credentials that the application will use in Google Cloud Secret Manager:
+```
+gcloud secrets create VANTAGE_USER_SECRET
+echo -n 'dbc' > /tmp/vantage_user.txt
+gcloud secrets versions add VANTAGE_USER_SECRET --data-file=/tmp/vantage_user.txt
+
+gcloud secrets create VANTAGE_PASSWORD_SECRET
+echo -n 'dbc' > /tmp/vantage_password.txt
+gcloud secrets versions add VANTAGE_PASSWORD_SECRET --data-file=/tmp/vantage_password.txt
+```
+
+8. The application that writes Segment data to Vantage will run on Cloud Run. We first need to allow Cloud Run to access the secrets:
+```
+gcloud projects add-iam-policy-binding $PROJECT_ID \
+    --member=serviceAccount:$PROJECT_NUMBER-compute@developer.gserviceaccount.com \
+    --role=roles/secretmanager.secretAccessor
+```
+
+9. Deploy the app to Cloud Run (set the `VANTAGE_HOST` value to the hostname or IP of your Teradata Vantage database). The second export statement saves the service URL, as we need it in subsequent commands:
+```
+gcloud run deploy --image gcr.io/$PROJECT_ID/segment-listener segment-listener \
+  --region $(gcloud config get-value compute/region) \
+  --update-env-vars VANTAGE_HOST=35.239.251.1 \
+  --update-secrets 'VANTAGE_USER=VANTAGE_USER_SECRET:1, VANTAGE_PASSWORD=VANTAGE_PASSWORD_SECRET:1' \
+  --no-allow-unauthenticated
+
+export SERVICE_URL=$(gcloud run services describe segment-listener --platform managed --region $(gcloud config get-value compute/region) --format 'value(status.url)')
+```
+
+10. Create a Pub/Sub topic that will receive events from Segment:
+```
+gcloud pubsub topics create segment-events
+```
+
+11. Create a service account that will be used by Pub/Sub to invoke the Cloud Run app:
+```
+gcloud iam service-accounts create cloud-run-pubsub-invoker \
+    --display-name "Cloud Run Pub/Sub Invoker"
+```
+
+12. Give the service account permission to invoke Cloud Run:
+```
+gcloud run services add-iam-policy-binding segment-listener \
+   --region $(gcloud config get-value compute/region) \
+   --member=serviceAccount:cloud-run-pubsub-invoker@$PROJECT_ID.iam.gserviceaccount.com \
+   --role=roles/run.invoker
+```
+
+13. Allow Pub/Sub to create authentication tokens in your project:
+```
+gcloud projects add-iam-policy-binding $PROJECT_ID \
+    --member=serviceAccount:service-$PROJECT_NUMBER@gcp-sa-pubsub.iam.gserviceaccount.com \
+    --role=roles/iam.serviceAccountTokenCreator
+```
+
+14.
Create a Pub/Sub subscription with the service account: +``` +gcloud pubsub subscriptions create segment-events-cloudrun-subscription --topic projects/$PROJECT_ID/topics/segment-events \ + --push-endpoint=$SERVICE_URL \ + --push-auth-service-account=cloud-run-pubsub-invoker@$PROJECT_ID.iam.gserviceaccount.com \ + --max-retry-delay 600 \ + --min-retry-delay 30 +``` + +15. Allow Segment to publish to your topic. To do that, assign `pubsub@segment-integrations.iam.gserviceaccount.com` role `Pub/Sub Publisher` in your project at https://console.cloud.google.com/cloudpubsub/topic/list. See [Segment manual](https://segment.com/docs/connections/destinations/catalog/google-cloud-pubsub/#authentication) for details. + +16. Configure your Google Cloud Pub/Sub a destination in Segment. Use the full topic `projects//topics/segment-events` and map all Segment event types (using `*` character) to the topic. + +## Try it out + +1. Use Segment's Event Tester functionality to send a sample payload to the topic. Verify that the sample data has been stored in Vantage. + +## Limitations + +* The example shows how to deploy the app in a single region. In many cases, this setup doesn't guarantee enough uptime. The Cloud Run app should be deployed in more than one region behind a Global Load Balancer. + +## Summary + +This how-to demonstrates how to send Segment events to Teradata Vantage. The configuration forwards events from Segment to Google Cloud Pub/Sub and then on to a Cloud Run application. The application writes data to Teradata Vantage. + +## Further reading +* [Segment Pub/Sub destination documentation](https://segment.com/docs/connections/destinations/catalog/google-cloud-pubsub/) + + \ No newline at end of file diff --git a/quickstarts/manage-data/select-the-right-data-ingestion-tools-for-teradata-vantage.md b/quickstarts/manage-data/select-the-right-data-ingestion-tools-for-teradata-vantage.md new file mode 100644 index 0000000000..5caa125034 --- /dev/null +++ b/quickstarts/manage-data/select-the-right-data-ingestion-tools-for-teradata-vantage.md @@ -0,0 +1,82 @@ +--- +id: select-the-right-data-ingestion-tools-for-teradata-vantage +sidebar_position: 2 +author: Krutik Pathak +email: krutik.pathak@teradata.com +page_last_update: February 29th, 2024 +description: Recommendation of data ingestions tools to be used in different use cases for Teradata Vantage +keywords: [data ingestion, teradata, nos, tpt, bteq, querygrid, airbyte, object store, saas, vantage, apache, spark, presto, oracle, Flow] +--- + +# Select the right data ingestion solution for Teradata Vantage + +## Overview + +This article outlines different use cases involving data ingestion. It lists available solutions and recommends the optimal solution for each use case. + +## High-volume ingestion, including streaming +Available solutions: + +* Use [Teradata Parallel Transporter API](https://docs.teradata.com/r/Teradata-Parallel-Transporter-Application-Programming-Interface-Programmer-Guide-17.20) +* Stream data to object storage and then ingest using [Teradata Native Object Store (NOS)](https://docs.teradata.com/r/Enterprise_IntelliFlex_VMware/Teradata-VantageTM-Native-Object-Store-Getting-Started-Guide-17.20/Welcome-to-Native-Object-Store). +* Use the [Teradata Parallel Transporter (TPT)](https://docs.teradata.com/r/Teradata-Parallel-Transporter-User-Guide/June-2022/Introduction-to-Teradata-PT) command line utility. 
+* Use [Teradata Query Service](https://docs.teradata.com/r/Teradata-Query-Service-Installation-Configuration-and-Usage-Guide-for-Customers/April-2022/Using-the-Query-Service-APIs/Getting-Started) - REST API to execute SQL statements in the database. +* Use Teradata database drivers such as JDBC (Java), teradatasql (Python), Node.js driver, ODBC, .NET Data Provider. + + +Teradata Parallel Transport API is usually the most performant solution which offers high throughput and minimum latency. Use it if you need to ingest tens of thousands of rows per second and if you are comfortable using C language. + +Use the Teradata database drivers when the number of events is in thousands per second. Consider using the Fastload protocol that is available in the most popular drivers e.g. JDBC, Python. + +If you don't want to manage the dependency on the driver libraries, use Query Service. Since Query Service uses the regular driver protocol to communicate to the database, the throughput of this solution is similar to the throughput offered by database drivers such as JDBC. If you are a vendor and are looking to integrate your product with Teradata, please be aware that not all Teradata customers have Query Service enabled in their sites. + +If your solution can accept higher latency, a good option is to stream events to object storage and then read the data using NOS. This solution usually requires the least amount of effort. + +### Ingest data from object storage + +Available solutions: + +* [Flow (VantageCloud Lake only)](https://docs.teradata.com/r/Teradata-VantageCloud-Lake/Loading-Data/Introduction-to-Flow) +* [Teradata Native Object Store (NOS)](https://docs.teradata.com/r/Enterprise_IntelliFlex_VMware/Teradata-VantageTM-Native-Object-Store-Getting-Started-Guide-17.20/Welcome-to-Native-Object-Store) +* [Teradata Parallel Transporter (TPT)](https://docs.teradata.com/r/Teradata-Parallel-Transporter-User-Guide/June-2022/Introduction-to-Teradata-PT) + +Flow is the recommended ingestion mechanism to bring data from object storage to VantageCloud Lake. For all other Teradata Vantage editions, Teradata NOS is the recommended option. NOS can leverage all Teradata nodes to perform ingestion. Teradata Parallel Transporter (TPT) runs on the client side. It can be used when there is no connectivity from NOS to object storage. + +### Ingest data from local files +Available solutions: + +* [Teradata Parallel Transporter (TPT)](https://docs.teradata.com/r/Teradata-Parallel-Transporter-User-Guide/June-2022/Introduction-to-Teradata-PT) +* [BTEQ](https://docs.teradata.com/r/Enterprise_IntelliFlex_Lake_VMware/Basic-Teradata-Query-Reference-17.20/Introduction-to-BTEQ) + +TPT is the recommended option to load data from local files. TPT is optimized for scalability and parallelism, thus it has the best throughput of all available options. BTEQ can be used when an ingestion process requires scripting. It also makes sense to continue using BTEQ if all your other ingestion pipelines run in BTEQ. 
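+
+As a rough illustration of the scripting option, the sketch below shows what a minimal BTEQ import of a comma-delimited file could look like. The host, credentials, file, table, and column names are placeholders rather than parts of an existing example, and the exact options depend on your data; see the BTEQ reference for details.
+
+``` bash
+/* Log on; replace the host and credentials with your own values. */
+.LOGON 127.0.0.1/dbc,dbc
+
+/* Read a comma-delimited file; with VARTEXT, every USING field must be VARCHAR. */
+.IMPORT VARTEXT ',' FILE = employees.csv
+
+/* Repeat the INSERT below for every record in the file. */
+.REPEAT *
+USING (in_id VARCHAR(10), in_first_name VARCHAR(30), in_last_name VARCHAR(30))
+INSERT INTO HR.employees_stage (employee_id, first_name, last_name)
+VALUES (:in_id, :in_first_name, :in_last_name);
+
+.LOGOFF
+.EXIT
+```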
+ +### Ingest data from SaaS applications +Available solutions: + +* Multiple 3rd party tools such as [Airbyte](https://airbyte.com/), [Precog](https://precog.com/), [Nexla](https://nexla.com/), [Fivetran](https://fivetran.com/) +* Export from SaaS apps to local files and then ingest using [Teradata Parallel Transporter (TPT)](https://docs.teradata.com/r/Teradata-Parallel-Transporter-User-Guide/June-2022/Introduction-to-Teradata-PT) +* Export from SaaS apps to object storage and then ingest using [Teradata Native Object Store (NOS)](https://docs.teradata.com/r/Enterprise_IntelliFlex_VMware/Teradata-VantageTM-Native-Object-Store-Getting-Started-Guide-17.20/Welcome-to-Native-Object-Store). + +3rd party tools are usually a better option to move data from SaaS applications to Teradata Vantage. They offer broad support for data sources and eliminate the need to manage intermediate steps such as exporting and storing exported datasets. + +### Use data stored in other databases for unified query processing +Available solutions: + +* [Teradata QueryGrid](https://docs.teradata.com/r/Teradata-QueryGridTM-Installation-and-User-Guide/October-2020/Teradata-QueryGrid-Overview) +* Export from other databases to local files and then ingest using [Teradata Parallel Transporter (TPT)](https://docs.teradata.com/r/Teradata-Parallel-Transporter-User-Guide/June-2022/Introduction-to-Teradata-PT) +* Export from other databases to object storage and then ingest using [Teradata Native Object Store (NOS)](https://docs.teradata.com/r/Enterprise_IntelliFlex_VMware/Teradata-VantageTM-Native-Object-Store-Getting-Started-Guide-17.20/Welcome-to-Native-Object-Store). + +QueryGrid is the recommended option to move limited quantities of data between different systems/platforms. This includes movement within Vantage instances, Apache Spark, Oracle, Presto, etc. It is especially suited to situations when what needs to be synced is described by complex conditions that can be expressed in SQL. + +## Summary +In this article, we explored various data ingestion use cases, provided a list of available tools for each use case, and identified the recommended options for different scenarios. + +## Further Reading + +* [Query data stored in object storage using NOS](https://quickstarts.teradata.com/nos.html) + +* [Run large bulkloads efficiently with Teradata Parallel Transporter](https://quickstarts.teradata.com/tools-and-utilities/run-bulkloads-efficiently-with-teradata-parallel-transporter.html) + +* [Teradata QueryGrid](https://docs.teradata.com/r/Teradata-QueryGridTM-Installation-and-User-Guide/October-2020/Teradata-QueryGrid-Overview) + +* [Use Airbyte to load data from external sources to Teradata Vantage](https://quickstarts.teradata.com/elt/use-airbyte-to-load-data-from-external-sources-to-teradata-vantage.html) diff --git a/quickstarts/manage-data/terraform-airbyte-provider.md b/quickstarts/manage-data/terraform-airbyte-provider.md new file mode 100644 index 0000000000..7871e98c44 --- /dev/null +++ b/quickstarts/manage-data/terraform-airbyte-provider.md @@ -0,0 +1,230 @@ +--- +sidebar_position: 9 +author: Janeth Graziani +email: Janeth.graziani@teradata.com +page_last_update: February 28, 2024 +description: Use Terraform to manage Teradata data pipelines in Airbyte using Terraform. 
+keywords: [Terraform, Airbyte, Teradata Vantage, data engineering, ELT, automation, data integration, CI/CD, version control]
+---
+
+import YouTubeVideo from '../_partials/terraform-video.mdx';
+
+# Manage ELT pipelines as code with Terraform and Airbyte on Teradata Vantage
+
+
+### Overview
+
+This quickstart explains how to use Terraform to manage Airbyte data pipelines as code. Instead of configuring resources manually through the web UI, we'll use code to create and manage Airbyte resources. The provided example illustrates a basic ELT pipeline from Google Sheets to Teradata Vantage using Airbyte's Terraform provider.
+
+The Airbyte Terraform provider is available for users on Airbyte Cloud, OSS, and Self-Managed Enterprise.
+
+Watch this concise explanation of how this integration works:
+
+
+
+### Introduction
+[Terraform](https://www.terraform.io) is a leading open-source tool in the Infrastructure as Code (IaC) space. It enables the automated provisioning and management of infrastructure, cloud platforms, and services via configuration files, instead of manual setup. Terraform uses plugins, known as Terraform providers, to communicate with infrastructure hosts, cloud providers, APIs, and SaaS platforms.
+
+Airbyte, the data integration platform, has a Terraform provider that communicates directly with [Airbyte's API](https://reference.airbyte.com/reference/start). This allows data engineers to manage Airbyte configurations, enforce version control, and apply good data engineering practices within their ELT pipelines.
+
+### Prerequisites
+* [Airbyte Cloud Account](https://airbyte.com/connectors/teradata-vantage). Start with a 14-day free trial that begins after the first successful sync.
+- Generate an Airbyte API Key by logging into the [developer portal](https://portal.airbyte.com).
+* Teradata Vantage Instance. You will need a database `Host`, `Username`, and `Password` for Airbyte’s Terraform configuration.
+- [Create a free Teradata instance on ClearScape Analytics Experience](https://quickstarts.teradata.com/getting-started-with-csae.html)
+
+* Source Data. For demonstration purposes, we will use a [sample Google Sheet](https://docs.google.com/spreadsheets/d/1XNBYUw3p7xG6ptfwjChqZ-dNXbTuVwPi7ToQfYKgJIE/edit#gid=0). Make a copy of it in a personal Google workspace.
+
+* [Google Cloud Platform API enabled for your personal or organizational account](https://support.google.com/googleapi/answer/6158841?hl=en). You’ll need to authenticate your Google account via OAuth or via Service Account Key Authenticator. In this example, we use [Service Account Key Authenticator](https://cloud.google.com/iam/docs/keys-create-delete).
+
+### Install Terraform
+* Run the commands for your operating system to install Terraform. Find additional options on the [Terraform site](https://developer.hashicorp.com/terraform/tutorials/aws-get-started/install-cli).
+
+[tabs, id="terraform_tab_install", role="emits-gtm-events"]
+====
+
+macOS::
++
+First, install the HashiCorp tap, a repository of all [Homebrew](https://brew.sh) packages.
++
+``` bash
+ brew tap hashicorp/tap
+```
++
+Next, install Terraform with hashicorp/tap/terraform.
++
+``` bash
+ brew install hashicorp/tap/terraform
+```
+Windows::
++
+[Chocolatey](https://chocolatey.org) is a free and open-source package management system for Windows. Install the Terraform package from the command line.
++ +``` bash + choco install terraform +``` +Linux:: ++ +``` bash +wget -O- https://apt.releases.hashicorp.com/gpg | sudo gpg --dearmor -o /usr/share/keyrings/hashicorp-archive-keyring.gpg +echo "deb [signed-by=/usr/share/keyrings/hashicorp-archive-keyring.gpg] https://apt.releases.hashicorp.com $(lsb_release -cs) main" | sudo tee /etc/apt/sources.list.d/hashicorp.list +sudo apt update && sudo apt install terraform +``` +==== + +### Environment preparation + +Prepare the environment by creating a directory for the Terraform configuration and initialize two files: `main.tf` and `variables.tf`. + +``` bash +mkdir terraform_airbyte +cd terraform_airbyte +touch main.tf variables.tf +``` + +### Define a data pipeline +Define the data source, destination and connection within the `main.tf` file. Open the newly created `main.tf` file in Visual Studio Code or any preferred code editor. + +- If using Visual Studio Code, install [HashiCorp Terraform Extensions](https://marketplace.visualstudio.com/items?itemName=HashiCorp.terraform) to add autocompletion and syntax highlighting. You can also add [Terraform by Anton Kuliko](https://marketplace.visualstudio.com/items?itemName=4ops.terraform) for configuration language support. + +![Terraform Extensions on Visual Studio Code](../elt/images/terraform-airbyte-provider/extensions.png) + +Populate the main.tf file with the template provided. +``` bash +# Provider Configuration +terraform { + required_providers { + airbyte = { + source = "airbytehq/airbyte" + version = "0.4.1" // Latest Version https://registry.terraform.io/providers/airbytehq/airbyte/latest + } + } +} +provider "airbyte" { + // If running on Airbyte Cloud, generate & save the API key from https://portal.airbyte.com + bearer_auth = var.api_key +} +# Google Sheets Source Configuration +resource "airbyte_source_google_sheets" "my_source_gsheets" { + configuration = { + source_type = "google-sheets" + credentials = { + service_account_key_authentication = { + service_account_info = var.google_private_key + } + } + names_conversion = true, + spreadsheet_id = var.spreadsheet_id + } + name = "Google Sheets" + workspace_id = var.workspace_id +} +# Teradata Vantage Destination Configuration +# For optional parameters visit https://registry.terraform.io/providers/airbytehq/airbyte/latest/docs/resources/destination_teradata +resource "airbyte_destination_teradata" "my_destination_teradata" { + configuration = { + host = var.host + password = var.password + schema = "airbyte_td_two" + ssl = false + ssl_mode = { + allow = {} + } + username = var.username + } + name = "Teradata" + workspace_id = var.workspace_id +} +# Connection Configuration +resource "airbyte_connection" "googlesheets_teradata" { + name = "Google Sheets - Teradata" + source_id = airbyte_source_google_sheets.my_source_gsheets.source_id + destination_id = airbyte_destination_teradata.my_destination_teradata.destination_id + schedule = { + schedule_type = "cron" // "manual" + cron_expression = "0 15 * * * ?" # This sets the data sync to run every 15 minutes of the hour + } + } +``` + +Note that this example uses a cron expression to schedule the data transfer to run every 15 minutes past the hour. + +In our `main.tf` file we reference variables which are held in the `variables.tf` file, including the API key, workspace ID, Google Sheet id, Google private key and Teradata Vantage credentials. Copy the following template into the `variables.tf` file and populate with the appropriate configuration values in the `default` attribute. 
+
+### Configuring the variables.tf file
+
+``` bash
+# Log in to https://portal.airbyte.com to generate an API key, then save it and populate this variable with it
+variable "api_key" {
+  type = string
+  default = ""
+}
+# workspace_id is found in the URL of your Airbyte Cloud account https://cloud.airbyte.com/workspaces//settings/dbt-cloud
+variable "workspace_id" {
+  type = string
+  default = ""
+}
+
+# Google spreadsheet id and Google private key
+variable "spreadsheet_id" {
+  type = string
+  default = ""
+}
+variable "google_private_key" {
+  type = string
+  default = ""
+}
+# Teradata Vantage connection credentials
+variable "host" {
+  type = string
+  default = ""
+}
+variable "username" {
+  type = string
+  default = "demo_user"
+}
+variable "password" {
+  type = string
+  default = ""
+}
+```
+
+### Execution Commands
+
+Run `terraform init` to pull down the provider plugin from the Terraform registry and initialize a working Terraform directory.
+
+This command should only be run after writing a new Terraform configuration or cloning an existing one from version control.
+
+![Initialize Terraform with Terraform init command](../elt/images/terraform-airbyte-provider/terraforminit.png)
+
+Run `terraform plan` to display the execution plan Terraform will use to create resources and make modifications to infrastructure.
+
+For this example, a plan for 3 new resources is created:
+
+Connection: # airbyte_connection.googlesheets_teradata will be created
+
+Destination: # airbyte_destination_teradata.my_destination_teradata will be created
+
+Source: # airbyte_source_google_sheets.my_source_gsheets will be created
+
+![View Terraform execution plan with terraform plan command](../elt/images/terraform-airbyte-provider/terraformplan.png)
+
+Run `terraform apply` and answer `yes` to generate the execution plan and carry it out.
+ +[Terraform Airbyte Provider Docs](https://registry.terraform.io/providers/airbytehq/airbyte/latest/docs/resources/destination_teradata#example-usage) \ No newline at end of file diff --git a/quickstarts/manage-data/transforming-external-data-loaded-via-airbyte-in-teradata-vantage-using-dbt.md b/quickstarts/manage-data/transforming-external-data-loaded-via-airbyte-in-teradata-vantage-using-dbt.md new file mode 100644 index 0000000000..f610a99812 --- /dev/null +++ b/quickstarts/manage-data/transforming-external-data-loaded-via-airbyte-in-teradata-vantage-using-dbt.md @@ -0,0 +1,238 @@ +--- +sidebar_position: 10 +author: Krutik Pathak +email: krutik.pathak@teradata.com +page_last_update: July 27, 2023 +description: This tutorial describes the type of transformations that are needed to transform external data loaded through Airbyte with dbt. +keywords: [dbt, airbyte, data transformation, data warehouses, compute storage separation, teradata, vantage, cloud data platform, object storage, data transfer, data extraction, enterprise analytics, elt.] +--- + +# Transform data Loaded with Airbyte using dbt + +### Overview + +This tutorial demonstrates how to use [dbt (Data Build Tool)](https://docs.getdbt.com/docs/introduction) to transform external data load through [Airbyte](https://github.com/airbytehq/airbyte) (an Open-Source Extract Load tool) in Teradata Vantage. + +This tutorial is based on the original [dbt Jaffle Shop tutorial](https://github.com/dbt-labs/jaffle_shop-dev) with a small change, instead of using the `dbt seed` command, the Jaffle Shop dataset is loaded from Google Sheets into Teradata Vantage using Airbyte. Data loaded through airbyte is contained in JSON columns as can be seen in the picture below: + +![Raw data in Teradata Vantage](../elt/images/getting-started-with-airbyte-dbt/raw_data_vantage_dbeaver.png) + +### Prerequisites + +import ClearscapeDocsNote from '../_partials/vantage_clearscape_analytics.mdx' + +* Access to a Teradata Vantage Instance. + +* Sample data: The sample data [Jaffle Shop Dataset](https://docs.google.com/spreadsheets/d/1-R4F3q8J9KDnFRWpiT3Ysp1RlOoUu3PeQR7xDeLxFts/edit#gid=42273685) can be found in Google Sheets. +* Reference dbt project repository: [Jaffle Project with Airbyte.](https://github.com/Teradata/airbyte-dbt-jaffle) +* Python 3.7, 3.8, 3.9, 3.10 or 3.11 installed. + +### Sample Data Loading +* Follow the steps in the [Airbyte tutorial](https://quickstarts.teradata.com/elt/use-airbyte-to-load-data-from-external-sources-to-teradata-vantage.html). Make sure you load data from the [Jaffle Shop spreadsheet](https://docs.google.com/spreadsheets/d/1-R4F3q8J9KDnFRWpiT3Ysp1RlOoUu3PeQR7xDeLxFts/edit#gid=42273685) and not the default dataset referenced by the Airbyte tutorial. Also, set the `Default Schema` in the Teradata destination to `airbyte_jaffle_shop`. + +:::note +When you configure a Teradata destination in Airbyte, it will ask for a `Default Schema`. Set the `Default Schema` to `airbyte_jaffle_shop`. +::: + +### Clone the project +Clone the tutorial repository and change the directory to the project directory: + +``` bash +git clone https://github.com/Teradata/airbyte-dbt-jaffle +cd airbyte-dbt-jaffle +``` + +### Install dbt +* Create a new python environment to manage dbt and its dependencies. Activate the environment: + +``` bash +python3 -m venv env +source env/bin/activate +``` + + +:::note +You can activate the virtual environment in Windows by executing the corresponding batch file `./myenv/Scripts/activate`. 
+::: + +* Install `dbt-teradata` module and its dependencies. The core dbt module is included as a dependency so you don't have to install it separately: + +``` bash +pip install dbt-teradata +``` + +### Configure dbt +* Initialize a dbt project. + +``` bash +dbt init +``` + + +The dbt project wizard will ask you for a project name and database management system to use in the project. In this demo, we define the project name as `dbt_airbyte_demo`. Since we are using the dbt-teradata connector, the only database management system available is Teradata. +![Project name prompt](../elt/images/getting-started-with-airbyte-dbt/dbt_init_project_name.png) +![Database name prompt](../elt/images/getting-started-with-airbyte-dbt/dbt_init_database_name.png) + +* Configure the `profiles.yml` file located in the `$HOME/.dbt` directory. If the `profiles.yml` file is not present, you can create a new one. +* Adjust `server`, `username`, `password` to match your Teradata instance's `HOST`, `Username`, `Password` respectively. +* In this configuration, `schema` stands for the database that contains the sample data, in our case that is the default schema that we defined in Airbyte `airbyte_jaffle_shop`. + +``` yaml , id="dbt_first_config", role="emits-gtm-events" +dbt_airbyte_demo: + target: dev + outputs: + dev: + type: teradata + server: + schema: airbyte_jaffle_shop + username: + password: + tmode: ANSI + +``` + +* Once the `profiles.yml` file is ready, we can validate the setup. Go to the dbt project folder and run the command: + +``` bash +dbt debug +``` + +If the debug command returned errors, you likely have an issue with the content of `profiles.yml`. If the setup is correct, you will get message `All checks passed!` +![dbt debug output](../elt/images/getting-started-with-airbyte-dbt/dbt_debug.png) + +### The Jaffle Shop dbt project + +`jaffle_shop` is a fictional restaurant that takes orders online. The data of this business consists of tables for `customers`, `orders` and `payments`that follow the entity relations diagram below: + +![](../images/dbt1.svg) + +The data in the source system is normalized. A dimensional model based on the same data, more suitable for analytics tools, is presented below: + +![](../images/dbt2.svg) + +### dbt transformations + +:::note +The complete dbt project encompassing the transformations detailed below is located at [Jaffle Project with Airbyte](https://github.com/Teradata/airbyte-dbt-jaffle). +::: + +The reference dbt project performs two types of transformations. + +* First, it transforms the raw data (in JSON format), loaded from Google Sheets via Airbyte, into staging views. At this stage the data is normalized. +* Next, it transforms the normalized views into a dimensional model ready for analytics. + +The following diagram shows the transformation steps in Teradata Vantage using dbt: + +![](../images/dita.svg) + + +As in all dbt projects, the folder `models` contains the data models that the project materializes as tables, or views, according to the corresponding configurations at the project, or individual model level. + +The models can be organized into different folders according to their purpose in the organization of the data warehouse/lake. Common folder layouts include a folder for `staging`, a folder for `core`, and a folder for `marts`. This structure can be simplified without affecting the workings of dbt. 
+
+### Staging models
+In the original [dbt Jaffle Shop tutorial](https://github.com/dbt-labs/jaffle_shop-dev) the project's data is loaded from csv files located in the `./data` folder through dbt's `seed` command. The `seed` command is commonly used to load small amounts of reference data from csv files; however, it is not designed to be a general-purpose data loading tool.
+
+In this demo we assume a more typical setup in which a tool designed for data loading, Airbyte, was used to load data into the data warehouse/lake.
+Data loaded through Airbyte, however, is represented as raw JSON strings. From this raw data we create normalized staging views. We perform this task through the following staging models.
+
+* The `stg_customers` model creates the normalized staging view for `customers` from the `_airbyte_raw_customers` table.
+* The `stg_orders` model creates the normalized view for `orders` from the `_airbyte_raw_orders` table.
+* The `stg_payments` model creates the normalized view for `payments` from the `_airbyte_raw_payments` table.
+
+:::note
+As the method of extracting JSON strings remains consistent across all staging models, we will provide a detailed explanation of the transformations using just one of these models as an example.
+:::
+
+Below is an example of transforming raw JSON data into a view through the `stg_orders.sql` model:
+``` sql
+WITH source AS (
+  SELECT * FROM {{ source('airbyte_jaffle_shop', '_airbyte_raw_orders')}}
+),
+
+flattened_json_data AS (
+  SELECT
+    _airbyte_data.JSONExtractValue('$.id') AS order_id,
+    _airbyte_data.JSONExtractValue('$.user_id') AS customer_id,
+    _airbyte_data.JSONExtractValue('$.order_date') AS order_date,
+    _airbyte_data.JSONExtractValue('$.status') AS status
+  FROM source
+)
+
+SELECT * FROM flattened_json_data
+```
+
+* In this model, the source is defined as the raw table `_airbyte_raw_orders`.
+* The raw table's columns contain both metadata and the actual ingested data. The data column is called `_airbyte_data`.
+* This column is of the Teradata JSON type. This type supports the `JSONExtractValue` method for retrieving scalar values from the JSON object.
+* In this model, we retrieve each of the attributes of interest and add meaningful aliases in order to materialize a view.
+
+### Dimensional models (marts)
+Building a dimensional model is a two-step process:
+
+* First, we take the normalized views in `stg_orders`, `stg_customers`, and `stg_payments` and build the denormalized intermediate join tables `customer_orders`, `order_payments`, and `customer_payments`. You will find the definitions of these tables in `./models/marts/core/intermediate`.
+* In the second step, we create the `dim_customers` and `fct_orders` models. These constitute the dimensional model tables that we want to expose to our BI tool. You will find the definitions of these tables in `./models/marts/core`.
+
+### Executing transformations
+To execute the transformations defined in the dbt project, we run:
+
+``` bash
+dbt run
+```
+
+You will get the status of each model as given below:
+
+![dbt run output](../elt/images/getting-started-with-airbyte-dbt/dbt_run.png)
+
+### Test data
+To ensure that the data in the dimensional model is correct, dbt allows us to define and execute tests against the data.
+
+The tests are defined in `./models/marts/core/schema.yml` and `./models/staging/schema.yml`. Each column can have multiple tests configured under the `tests` key.
+
+* For example, we expect the `fct_orders.order_id` column to contain unique, non-null values (see the sketch below).
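+
+For reference, such an expectation is typically declared in `./models/marts/core/schema.yml` along the following lines. This is a minimal sketch; the file in the reference project covers additional models and columns.
+
+``` yaml
+version: 2
+
+models:
+  - name: fct_orders
+    columns:
+      - name: order_id
+        tests:
+          - unique
+          - not_null
+```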
+ +To validate that the data in the produced tables satisfies the test conditions run: + +``` bash +dbt test +``` + +If the data in the models satisfies all the test cases, the result of this command will be as below: + +![dbt test output](../elt/images/getting-started-with-airbyte-dbt/dbt_test.png) + +### Generate documentation + +Our model consists of just a few tables. In a scenario with more sources of data, and a more complex dimensional model, documenting the data lineage and what is the purpose of each of the intermediate models is very important. + +Generating this type of documentation with dbt is very straight forward. + +``` bash +dbt docs generate +``` + +This will produce html files in the `./target` directory. + +You can start your own server to browse the documentation. The following command will start a server and open up a browser tab with the docs' landing page: + +``` bash +dbt docs serve +``` + +#### Lineage graph + +![dbt lineage graph](../elt/images/getting-started-with-airbyte-dbt/dbt_docs_serve.png) + +### Summary + +This tutorial demonstrated how to use dbt to transform raw JSON data loaded through Airbyte into dimensional model in Teradata Vantage. The sample project takes raw JSON data loaded in Teradata Vantage, creates normalized views and finally produces a dimensional data mart. We used dbt to transform JSON into Normalized views and multiple dbt commands to create models (`dbt run`), test the data (`dbt test`), and generate and serve model documentation (`dbt docs generate`, `dbt docs serve`). + + +### Further reading +* [dbt documentation](https://docs.getdbt.com/docs) +* [dbt-teradata plugin documentation](https://github.com/Teradata/dbt-teradata) + +import CommunityLinkPartial from '../_partials/community_link.mdx'; + + diff --git a/quickstarts/manage-data/use-airbyte-to-load-data-from-external-sources-to-teradata-vantage.md b/quickstarts/manage-data/use-airbyte-to-load-data-from-external-sources-to-teradata-vantage.md new file mode 100644 index 0000000000..5ae9ab675a --- /dev/null +++ b/quickstarts/manage-data/use-airbyte-to-load-data-from-external-sources-to-teradata-vantage.md @@ -0,0 +1,207 @@ +--- +sidebar_position: 8 +author: Krutik Pathak +email: krutik.pathak@teradata.com +page_last_update: June 9th, 2023 +description: Use Airbyte with Teradata Vantage. +keywords: [airbyte, data warehouses, compute storage separation, teradata, vantage, cloud data platform, object storage, data transfer, data extraction, enterprise analytics, elt.] + +dir: getting-started-with-airbyte +--- + +# Use Airbyte to load data from external sources to Teradata Vantage + + +### Overview + +This tutorial showcases how to use Airbyte to move data from sources to Teradata Vantage, detailing both the [Airbyte Open Source](https://docs.airbyte.com/using-airbyte/getting-started) and [Airbyte Cloud options](https://airbyte.com). This specific example covers replication from Google Sheets to Teradata Vantage. + +* Source: Google Sheets +* Destination: Teradata Vantage + +![Sample Employees Payrate Google Sheets](../elt/images/getting-started-with-airbyte/sample_employees_payrate_google_sheets.png) + +### Prerequisites + +import ClearscapeDocsNote from '../_partials/vantage_clearscape_analytics.mdx' + +* Access to a Teradata Vantage Instance. This will be defined as the destination of the Airbyte connection. You will need a database `Host`, `Username`, and `Password` for Airbyte’s configuration. 
+ + +* [Google Cloud Platform API enabled for your personal or organizational account](https://support.google.com/googleapi/answer/6158841?hl=en). You’ll need to authenticate your Google account via OAuth or via Service Account Key Authenticator. In this example, we use Service Account Key Authenticator. + +* Data from the source system. In this case, we use a [sample spreadsheet from google sheets](https://docs.google.com/spreadsheets/d/1XNBYUw3p7xG6ptfwjChqZ-dNXbTuVwPi7ToQfYKgJIE/edit). The sample data is a breakdown of payrate by employee type. + +### Airbyte Cloud +* Create an account on [Airbyte Cloud](https://airbyte.com) and skip to the instructions under the [Airbyte Configuration](#airbyte_configuration) section. + +### Airbyte Open Source +* Install Docker Compose to run [Airbyte Open Source](https://github.com/airbytehq/airbyte) locally. Docker Compose comes with Docker Desktop. Please refer to [docker docs](https://docs.docker.com/compose/install) for more details. + +* Clone the Airbyte Open Source repository and go to the airbyte directory. + +``` bash +git clone --depth 1 https://github.com/airbytehq/airbyte.git +cd airbyte +``` + +Make Sure to have Docker Desktop running before running the shell script `run-ab-platform`. + +* Run the shell script `run-ab-platform` as + + +``` bash +./run-ab-platform.sh +``` + + +:::note +You can run the above commands with `git bash` in Windows. Please refer to the [Airbyte Local Deployment](https://docs.airbyte.com/deploying-airbyte/local-deployment) for more details. +::: + +* Log in to the web app http://localhost:8000/ by entering the default credentials found in the `.env` file included in the repository. + + +``` bash +BASIC_AUTH_USERNAME=airbyte +BASIC_AUTH_PASSWORD=password +``` + + +When logging in for the first time, Airbyte will prompt you to provide your email address and specify your preferences for product improvements. Enter your preferences and click on "Get started." + +![Specify Preferences](../elt/images/getting-started-with-airbyte/specify_preferences.png) + +Once Airbyte Open Source is launched you will see a connections dashboard. If you launched Airbyte Open Source for the first time, it would not show any connections. + +### Airbyte Configuration + +### Setting the Source Connection +* You can either click "Create your first connection" or click on the top right corner to initiate the new connection workflow on Airbyte's Connections dashboard. + +![Dashboard to create first connection](../elt/images/getting-started-with-airbyte/create_first_connection.png) + +* Airbyte will ask you for the Source, you can select from an existing source (if you have set it up already) or you can set up a new source, in this case we select `Google Sheets`. + +* For authentication we are using `Service Account Key Authentication` which uses a service account key in JSON format. Toggle from the default `OAuth` to `Service Account Key Authentication`. To authenticate your Google account via Service Account Key Authentication, enter your [Google Cloud service account key](https://cloud.google.com/iam/docs/keys-create-delete#creating_service_account_keys) in JSON format. + +Make sure the Service Account has the Project Viewer permission. If your spreadsheet is viewable by anyone with its link, no further action is needed. If not, [give your Service account access to your spreadsheet](https://www.youtube.com/watch?v=GyomEw5a2NQ). + +* Add the link to the source spreadsheet as `Spreadsheet Link`. 
+ + +![Configuring the source in Airbyte](../elt/images/getting-started-with-airbyte/configuring_source_gsheet_airbyte.png) + +:::note +For more details, please refer [Setting Google Sheets as Source Connector in Airbyte Open Source](https://docs.airbyte.com/integrations/sources/google-sheets/#:~:text=For%20Airbyte%20Open%20Source%3A) +::: + +* Click Set up source, if the configuration is correct, you will get the message `All connection tests passed!` + + +### Setting the Destination Connection +* Assuming you want to create a fresh new connection with `Teradata Vantage`, Select `Teradata Vantage` as the destination type under the "Set up the destination" section. +* Add the `Host`, `User`, and `Password`. These are the same as the `Host`, `Username`, and `Password` respectively, used by your Clearscape Analytics Environment. +* Provide a default schema name appropriate to your specific context. Here we have provided `gsheet_airbyte_td`. + +:::note +If you do not provide a `Default Schema`, you will get an error stating "Connector failed while creating schema". Make sure you provide appropriate name in the `Default Schema`. +::: + + +![Configuring the destination Teradata in Airbyte](../elt/images/getting-started-with-airbyte/configuring_destination_teradata_airbyte.png) + + +* Click Set up destination, if the configuration is correct, you will get the message `All connection tests passed!` + + +:::note +You might get a configuration check failed error. Make sure your Teradata Vantage instance is running properly before making a connection through Airbyte. +::: + +### Configuring Data Sync +A namespace is a group of streams [tables) in a source or destination. A schema in a relational database system is an example of a namespace. In a source, the namespace is the location from where the data is replicated to the destination. In a destination, the namespace is the location where the replicated data is stored in the destination. +For more details please refer to [Airbyte Namespace](https://docs.airbyte.com/understanding-airbyte/namespaces) + +![Namespaces in the destination](../elt/images/getting-started-with-airbyte/namespaces_in_destination.png) + + +In our example the destination is a database, so the namespace is the default schema `gsheet_airbyte_td` we defined when we configured the destination. The stream name is a table that is mirroring the name of the spreadsheet in the source, which is `sample_employee_payrate` in this case. Since we are using the single spreadsheet connector, it only supports one stream [the active spreadsheet). + +Other type of sources and destinations might have a different layout. In this example, Google sheets, as source, does not support a namespace. +In our example, we have used `` as the Namespace of the destination, this is the default namespace assigned by Airbyte based on the `Default Schema` we declared in the destination settings. The database `gsheet_airbyte_td` will be created in our Teradata Vantage Instance. + +:::note +We use the term "schema", as it is the term used by Airbyte. In a Teradata context the term "database" is the equivalent. +::: + +#### Replication Frequency +It shows how often data should sync to destination. You can select every hour, 2 hours, 3 hours etc. In our case we used every 24 hours. + +![Replication Frequency 24 hours](../elt/images/getting-started-with-airbyte/replication_frequency_24hr.png) + +You can also use a Cron expression to specify the time when the sync should run. 
In the example below, we set the Cron expression to run the sync every Wednesday at 12:43 PM (US/Pacific) time.
+
+![Replication Frequency Cron Expression](../elt/images/getting-started-with-airbyte/replication_frequency_cron_expression.png)
+
+### Data Sync Validation
+
+Airbyte tracks synchronization attempts in the "Sync History" section of the `Status` tab.
+
+![Data Sync Summary](../elt/images/getting-started-with-airbyte/data_sync_summary.png)
+
+Next, you can go to the [ClearScape Analytics Experience](https://clearscape.teradata.com/dashboard) and run a Jupyter notebook (notebooks in ClearScape Analytics Experience are configured to run Teradata SQL queries) to verify that the database `gsheet_airbyte_td`, the streams (tables), and the complete data are present.
+
+![Data Sync Validation in Teradata](../elt/images/getting-started-with-airbyte/data_sync_validation_in_teradata.png)
+
+``` bash
+%connect local
+```
+
+``` bash , id="airbyte_select_query", role="emits-gtm-events"
+SELECT DatabaseName, TableName, CreateTimeStamp, LastAlterTimeStamp
+FROM DBC.TablesV
+WHERE DatabaseName = 'gsheet_airbyte_td'
+ORDER BY TableName;
+```
+
+``` bash
+DATABASE gsheet_airbyte_td;
+```
+
+``` bash
+SELECT * FROM _airbyte_raw_sample_employee_payrate;
+```
+
+The stream (table) name in the destination is prefixed with `_airbyte_raw_` because Normalization and Transformation are not supported for this connection, and [we only have the raw table](https://docs.airbyte.com/understanding-airbyte/namespaces/#:~:text=If%20you%20don%27t%20enable%20basic%20normalization%2C%20you%20will%20only%20receive%20the%20raw%20tables.). Each stream (table) contains 3 columns:
+
+1. `_airbyte_ab_id`: a UUID assigned by Airbyte to each event that is processed. The column type in Teradata is `VARCHAR(256)`.
+
+2. `_airbyte_emitted_at`: a timestamp representing when the event was pulled from the data source. The column type in Teradata is `TIMESTAMP(6)`.
+
+3. `_airbyte_data`: a JSON blob representing the event data. The column type in Teradata is `JSON`.
+
+Here in the `_airbyte_data` column, we see 9 rows, the same number as in the source Google sheet, and the data is in JSON format, which can be transformed further as needed.
+
+### Close and delete the connection
+
+* You can close the connection in Airbyte by disabling the connection. This will stop the data sync process.
+
+![Close Airbyte Connection](../elt/images/getting-started-with-airbyte/close_airbyte_connection.png)
+
+* You can also delete the connection.
+
+![Delete Airbyte Connection](../elt/images/getting-started-with-airbyte/delete_airbyte_connection.png)
+
+
+### Summary
+This tutorial demonstrated how to extract data from a source system like Google Sheets and use the Airbyte ELT tool to load the data into a Teradata Vantage instance. We saw the end-to-end data flow and the complete configuration steps for running Airbyte Open Source locally and configuring the source and destination connections. We also discussed the available data sync configurations based on replication frequency. We validated the results in the destination using ClearScape Analytics Experience, and finally we saw how to pause and delete the Airbyte connection.
+ +### Further reading +[Teradata Destination | Airbyte Documentation](https://docs.airbyte.com/integrations/destinations/teradata/?_ga=2.156631291.1502936448.1684794236-1752661382.1684794236) + +[Core Concepts | Airbyte Documentation,](https://docs.airbyte.com/cloud/core-concepts/#connection-sync-modes) + +[Airbyte Community Slack](https://airbyte.com/community) + +[Airbyte Community](https://discuss.airbyte.io) + diff --git a/quickstarts/manage-data/using-feast-feature-store-with-teradata-vantage.md b/quickstarts/manage-data/using-feast-feature-store-with-teradata-vantage.md new file mode 100644 index 0000000000..1c707e790f --- /dev/null +++ b/quickstarts/manage-data/using-feast-feature-store-with-teradata-vantage.md @@ -0,0 +1,255 @@ +--- +sidebar_position: 6 +author: Mohammmad Taha Wahab, Mohammad Harris Mansur and Will Fleury +email: mohammadtaha.wahab@teradata.com , mohammadharris.mansur@teradata.com and will.fleury@teradata.com +page_last_update: January 5th, 2023 +description: Feast Enterprise Feature Store Connector with Teradata +keywords: [data warehouses, analytics, teradata, vantage, time series, business intelligence, enterprise analytics, feature store, Feast, connector] +--- + +# Build a FEAST feature store in Teradata Vantage + +## Introduction + +Feast's connector for Teradata is a complete implementation with support for all features and uses Teradata Vantage as an online and offline store. + +## Prerequisites + +Access to a Teradata Vantage instance. + +import ClearscapeDocsNote from '../_partials/vantage_clearscape_analytics.mdx' + + + + +## Overview +This how-to assumes you know Feast terminology. If you need a refresher check out the official [FEAST documentation](https://docs.feast.dev) + +This document demonstrates how developers can integrate `Teradata's offline and online store` with Feast. Teradata's offline stores allow users to use any underlying data store as their offline feature store. Features can be retrieved from the offline store for model training and can be materialized into the online feature store for use during model inference. + +On the other hand, online stores are used to serve features at low latency. The `materialize` command can be used to load feature values from the data sources (or offline stores) into the online store + +The `feast-teradata` library adds support for Teradata as + +* `OfflineStore` +* `OnlineStore` + +Additionally, using Teradata as the registry (catalog) is already supported via the `registry_type: sql` and included in our examples. This means that everything is located in Teradata. However, depending on the requirements, installation, etc, this can be mixed and matched with other systems as appropriate. + +## Getting Started + +To get started, install the `feast-teradata` library +``` bash , id="feast_pip_install", role="emits-gtm-events" +pip install feast-teradata +``` + +Let's create a simple feast setup with Teradata using the standard drivers' dataset. Note that you cannot use `feast init` as this command only works for templates that are part of the core feast library. We intend on getting this library merged into feast core eventually but for now, you will need to use the following cli command for this specific task. All other `feast` cli commands work as expected. + +``` bash +feast-td init-repo +``` + +This will then prompt you for the required information for the Teradata system and upload the example dataset. Let's assume you used the repo name `demo` when running the above command. 
You can find the repository files along with a file called `test_workflow.py`. Running this `test_workflow.py` will execute a complete workflow for the feast with Teradata as the Registry, OfflineStore, and OnlineStore. + +``` bash +demo/ + feature_repo/ + driver_repo.py + feature_store.yml + test_workflow.py +``` + +From within the `demo/feature_repo` directory, execute the following feast command to apply (import/update) the repo definition into the registry. You will be able to see the registry metadata tables in the teradata database after running this command. + +``` bash +feast apply +``` + +To see the registry information in the feast UI, run the following command. Note the --registry_ttl_sec is important as by default it polls every 5 seconds. + +``` bash +feast ui --registry_ttl_sec=120 +``` + +## Offline Store Config +``` yaml + +project: +registry: +provider: local +offline_store: + type: feast_teradata.offline.teradata.TeradataOfflineStore + host: + database: + user: + password: + log_mech: + +``` + +## Repo Definition + +Below is an example of definition.py which elaborates how +to set the entity, source connector, and feature view. + +Now to explain the different components: + +* `TeradataSource:` Data Source for features stored in Teradata (Enterprise or Lake) or accessible via a Foreign Table from Teradata (NOS, QueryGrid) + +* `Entity:` A collection of semantically related features + +* `Feature View:` A feature view is a group of feature data from a specific data source. Feature views allow you to consistently define features and their data sources, enabling the reuse of feature groups across a project + + +``` python +driver = Entity(name="driver", join_keys=["driver_id"]) +project_name = yaml.safe_load(open("feature_store.yaml")]("project"] + +driver_stats_source = TeradataSource( + database=yaml.safe_load(open("feature_store.yaml")]("offline_store"]["database"], + table=f"{project_name}_feast_driver_hourly_stats", + timestamp_field="event_timestamp", + created_timestamp_column="created", +) + +driver_stats_fv = FeatureView( + name="driver_hourly_stats", + entities=[driver], + ttl=timedelta(weeks=52 * 10), + schema=[ + Field(name="driver_id", dtype=Int64), + Field(name="conv_rate", dtype=Float32), + Field(name="acc_rate", dtype=Float32), + Field(name="avg_daily_trips", dtype=Int64), + ], + source=driver_stats_source, + tags={"team": "driver_performance"}, +) + +``` + +## Offline Store Usage + +There are two different ways to test your offline store as explained below. But first, there are a few mandatory steps to follow: + +Now, let's batch-read some features for training, using only entities (population) for which we have seen an event in the last `60` days. The predicates (filter) used can be on anything relevant for the entity (population) selection for the given training dataset. The `event_timestamp` is only for example purposes. + +``` python + +from feast import FeatureStore +store = FeatureStore(repo_path="feature_repo") +training_df = store.get_historical_features( + entity_df=f""" + SELECT + driver_id, + event_timestamp + FROM demo_feast_driver_hourly_stats + WHERE event_timestamp BETWEEN (CURRENT_TIMESTAMP - INTERVAL '60' DAY) AND CURRENT_TIMESTAMP + """, + features=[ + "driver_hourly_stats:conv_rate", + "driver_hourly_stats:acc_rate", + "driver_hourly_stats:avg_daily_trips" + ], +).to_df() +print(training_df.head()) +``` + +The `feast-teradata` library allows you to use the complete set of feast APIs and functionality. 
+The `feast-teradata` library allows you to use the complete set of feast APIs and functionality. Please refer to the official feast [quickstart](https://docs.feast.dev/getting-started/quickstart) for more details on the various things you can do.
+
+## Online Store
+
+Feast materializes data to online stores for low-latency lookup at model inference time. Typically, key-value stores are used as online stores; however, relational databases can be used for this purpose as well.
+
+Users can develop their own online stores by creating a class that implements the contract in the `OnlineStore` class.
+
+## Online Store Config
+``` yaml
+project:
+registry:
+provider: local
+offline_store:
+    type: feast_teradata.offline.teradata.TeradataOfflineStore
+    host:
+    database:
+    user:
+    password:
+    log_mech:
+```
+
+## Online Store Usage
+Before we can test the online store, we need to materialize features into it.
+
+The `materialize_incremental` command incrementally materializes features into the online store. If there are no new features to be added, this command essentially does nothing. With `materialize_incremental`, the start time is either `now - ttl` (the ttl that we defined in our feature views) or the time of the most recent materialization. If you have materialized features at least once, subsequent materializations only fetch features that were not present in the store at the time of the previous materializations.
+
+``` bash
+CURRENT_TIME=$(date +'%Y-%m-%dT%H:%M:%S')
+feast materialize-incremental $CURRENT_TIME
+```
+
+Next, when fetching online features, there are two parameters: `features` and `entity_rows`. The `features` parameter is a list and can take any number of features that are present in the feature view; the example below fetches three of them, but you can request any subset. The `entity_rows` parameter is also a list and takes dictionaries of the form `{feature_identifier_column: value_to_be_fetched}`. In our case, the column `driver_id` uniquely identifies the rows of the `driver` entity, and we fetch feature values for the rows where `driver_id` is 1001 and 1002. To fetch multiple such rows, pass one dictionary per entity: `[{driver_id: val_1}, {driver_id: val_2}, ..., {driver_id: val_n}]`.
+
+``` python
+entity_rows = [
+    {
+        "driver_id": 1001,
+    },
+    {
+        "driver_id": 1002,
+    },
+]
+
+features_to_fetch = [
+    "driver_hourly_stats:acc_rate",
+    "driver_hourly_stats:conv_rate",
+    "driver_hourly_stats:avg_daily_trips"
+]
+
+returned_features = store.get_online_features(
+    features=features_to_fetch,
+    entity_rows=entity_rows,
+).to_dict()
+
+for key, value in sorted(returned_features.items()):
+    print(key, " : ", value)
+```
+
+## How to set SQL Registry
+
+The last piece is the SQL registry. We first build a connection string from the username, password, host, database name, and logon mechanism; feast then uses it to establish a connection to the Teradata database.
+``` python
+path = 'teradatasql://' + teradata_user + ':' + teradata_password + '@' + host + '/?database=' + teradata_database + '&LOGMECH=' + teradata_log_mech
+```
+
+The SQL registry will create the following tables in your database:
+
+* entities (entity_name, project_id, last_updated_timestamp, entity_proto)
+
+* data_sources (data_source_name, project_id, last_updated_timestamp, data_source_proto)
+
+* feature_views (feature_view_name, project_id, last_updated_timestamp, materialized_intervals, feature_view_proto, user_metadata)
+
+* request_feature_views (feature_view_name, project_id, last_updated_timestamp, feature_view_proto, user_metadata)
+
+* stream_feature_views (feature_view_name, project_id, last_updated_timestamp, feature_view_proto, user_metadata)
+
+* managed_infra (infra_name, project_id, last_updated_timestamp, infra_proto)
+
+* validation_references (validation_reference_name, project_id, last_updated_timestamp, validation_reference_proto)
+
+* saved_datasets (saved_dataset_name, project_id, last_updated_timestamp, saved_dataset_proto)
+
+* feature_services (feature_service_name, project_id, last_updated_timestamp, feature_service_proto)
+
+* on_demand_feature_views (feature_view_name, project_id, last_updated_timestamp, feature_view_proto, user_metadata)
+
+Additionally, if you want to see a complete (though not real-world) end-to-end example workflow, see the `demo/test_workflow.py` script, which is used to test the complete feast functionality.
+
+An enterprise feature store accelerates the crucial stages of data analysis, improves productivity, and shortens the time it takes to bring products to market. Integrating Teradata with Feast brings Teradata's highly efficient parallel processing into the feature store, improving performance.
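+
+Putting the configuration pieces together, a `feature_store.yaml` that keeps both the registry and the offline store in Teradata might look roughly like the following. This is a sketch rather than a generated file: the angle-bracketed values are placeholders, and the `registry` block simply combines the `registry_type: sql` option mentioned earlier with the `teradatasql://` connection string format shown above.
+
+``` yaml
+project: demo
+provider: local
+registry:
+    registry_type: sql
+    path: teradatasql://<user>:<password>@<host>/?database=<database>&LOGMECH=<logmech>
+offline_store:
+    type: feast_teradata.offline.teradata.TeradataOfflineStore
+    host: <host>
+    database: <database>
+    user: <user>
+    password: <password>
+    log_mech: <logmech>
+```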
+ +## Further reading + +* [Feast Scalable Registry](https://docs.feast.dev/tutorials/using-scalable-registry) +* [Enabling highly scalable feature store with Teradata Vantage and FEAST](https://medium.com/teradata/enabling-highly-scalable-feature-store-with-teradata-vantage-and-feast-e01008fa8fdb) + +import CommunityLinkPartial from '../_partials/community_link.mdx'; + + diff --git a/quickstarts/modelops/_partials/vantage_clearscape_analytics.md b/quickstarts/modelops/_partials/vantage_clearscape_analytics.md new file mode 100644 index 0000000000..389876ac42 --- /dev/null +++ b/quickstarts/modelops/_partials/vantage_clearscape_analytics.md @@ -0,0 +1,5 @@ +:::note +If you need a test instance of Vantage, you can provision one for free at [https://clearscape.teradata.com](https://clearscape.teradata.com/sign-in?utm_source=dev_portal&utm_medium=quickstart_tutorial&utm_campaign=quickstarts) +::: + +![test](../images/run-vantage/boot-manager-menu.png) \ No newline at end of file diff --git a/quickstarts/modelops/attachments/ModelOps_Data_files_v6.zip b/quickstarts/modelops/attachments/ModelOps_Data_files_v6.zip new file mode 100644 index 0000000000..54843e0455 Binary files /dev/null and b/quickstarts/modelops/attachments/ModelOps_Data_files_v6.zip differ diff --git a/quickstarts/modelops/attachments/ModelOps_Data_files_v7.zip b/quickstarts/modelops/attachments/ModelOps_Data_files_v7.zip new file mode 100644 index 0000000000..93902f2ca4 Binary files /dev/null and b/quickstarts/modelops/attachments/ModelOps_Data_files_v7.zip differ diff --git a/quickstarts/modelops/attachments/ModelOps_Operationalize_v6.ipynb b/quickstarts/modelops/attachments/ModelOps_Operationalize_v6.ipynb new file mode 100755 index 0000000000..0447e1182f --- /dev/null +++ b/quickstarts/modelops/attachments/ModelOps_Operationalize_v6.ipynb @@ -0,0 +1,732 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Overview\n", + "\n", + "Once we have finished experiementation and found a good model, we want to operationalize it. \n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "import sys\n", + "logging.basicConfig(stream=sys.stdout, level=logging.INFO)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Host: tdprd.td.teradata.com\n", + "Username: wf250003\n", + "Password: ········\n" + ] + } + ], + "source": [ + "from teradataml import create_context\n", + "import getpass\n", + "\n", + "host = input(\"Host: \")\n", + "username = input(\"Username: \")\n", + "password = getpass.getpass(\"Password: \")\n", + "val_db = input(\"VAL DB: \")\n", + "byom_db = input(\"BYOM DB: \")\n", + "\n", + "# configure byom/val installation\n", + "configure.val_install_location = val_db\n", + "configure.byom_install_location = byom_db\n", + "\n", + "# by default we assume your are using your user database. 
change as required\n", + "database = username\n", + "\n", + "create_context(host=host, username=username, password=password, logmech=\"TDNEGO\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define Training Function\n", + "\n", + "The training function takes the following shape\n", + "\n", + "```python\n", + "def train(context: ModelContext, **kwargs):\n", + " aoa_create_context()\n", + " \n", + " # your training code\n", + " \n", + " # save your model\n", + " joblib.dump(model, f\"{context.artifact_output_path}/model.joblib\")\n", + " \n", + " record_training_stats(...)\n", + "```\n", + "\n", + "You can execute this from the CLI or directly within the notebook as shown." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# %%writefile ../model_modules/training.py\n", + "\n", + "from xgboost import XGBClassifier\n", + "from sklearn.preprocessing import MinMaxScaler\n", + "from sklearn.pipeline import Pipeline\n", + "from nyoka import xgboost_to_pmml\n", + "from teradataml import DataFrame\n", + "from aoa import (\n", + " record_training_stats,\n", + " save_plot,\n", + " aoa_create_context,\n", + " ModelContext\n", + ")\n", + "\n", + "import joblib\n", + "\n", + "\n", + "def train(context: ModelContext, **kwargs):\n", + " aoa_create_context()\n", + "\n", + " feature_names = context.dataset_info.feature_names\n", + " target_name = context.dataset_info.target_names[0]\n", + "\n", + " # read training dataset from Teradata and convert to pandas\n", + " train_df = DataFrame.from_query(context.dataset_info.sql)\n", + " train_pdf = train_df.to_pandas(all_rows=True)\n", + "\n", + " # split data into X and y\n", + " X_train = train_pdf[feature_names]\n", + " y_train = train_pdf[target_name]\n", + "\n", + " print(\"Starting training...\")\n", + "\n", + " # fit model to training data\n", + " model = Pipeline([('scaler', MinMaxScaler()),\n", + " ('xgb', XGBClassifier(eta=context.hyperparams[\"eta\"],\n", + " max_depth=context.hyperparams[\"max_depth\"]))])\n", + "\n", + " model.fit(X_train, y_train)\n", + "\n", + " print(\"Finished training\")\n", + "\n", + " # export model artefacts\n", + " joblib.dump(model, f\"{context.artifact_output_path}/model.joblib\")\n", + "\n", + " # we can also save as pmml so it can be used for In-Vantage scoring etc.\n", + " xgboost_to_pmml(pipeline=model, col_names=feature_names, target_name=target_name,\n", + " pmml_f_name=f\"{context.artifact_output_path}/model.pmml\")\n", + "\n", + " print(\"Saved trained model\")\n", + "\n", + " from xgboost import plot_importance\n", + " model[\"xgb\"].get_booster().feature_names = feature_names\n", + " plot_importance(model[\"xgb\"].get_booster(), max_num_features=10)\n", + " save_plot(\"feature_importance.png\", context=context)\n", + "\n", + " feature_importance = model[\"xgb\"].get_booster().get_score(importance_type=\"weight\")\n", + "\n", + " record_training_stats(train_df,\n", + " features=feature_names,\n", + " predictors=[target_name],\n", + " categorical=[target_name],\n", + " importance=feature_importance,\n", + " context=context)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:aoa.util.connections:teradataml context already exists. 
Skipping create_context.\n", + "Starting training...\n", + "Finished training\n", + "Saved trained model\n", + "INFO:aoa.stats.stats:Computing training dataset statistics\n" + ] + }, + { + "data": { + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from aoa import ModelContext, DatasetInfo\n", + "from teradataml import configure\n", + "\n", + "# Define the ModelContext to test with. The ModelContext is created and managed automatically by ModelOps \n", + "# when it executes your code via CLI / UI. However, for testing in the notebook, you can define as follows\n", + "\n", + "# define the training dataset \n", + "sql = \"\"\"\n", + "SELECT \n", + " F.*, D.hasdiabetes\n", + "FROM PIMA_PATIENT_FEATURES F \n", + "JOIN PIMA_PATIENT_DIAGNOSES D\n", + "ON F.patientid = D.patientid\n", + " WHERE D.patientid MOD 5 <> 0\n", + "\"\"\"\n", + "\n", + "feature_metadata = {\n", + " \"database\": database,\n", + " \"table\": \"aoa_feature_metadata\"\n", + "}\n", + "hyperparams = {\"max_depth\": 5, \"eta\": 0.2}\n", + "\n", + "entity_key = \"PatientId\"\n", + "target_names = [\"HasDiabetes\"]\n", + "feature_names = [\"NumTimesPrg\", \"PlGlcConc\", \"BloodP\", \"SkinThick\", \"TwoHourSerIns\", \"BMI\", \"DiPedFunc\", \"Age\"]\n", + " \n", + "dataset_info = DatasetInfo(sql=sql,\n", + " entity_key=entity_key,\n", + " feature_names=feature_names,\n", + " target_names=target_names,\n", + " feature_metadata=feature_metadata)\n", + "\n", + "\n", + "ctx = ModelContext(hyperparams=hyperparams,\n", + " dataset_info=dataset_info,\n", + " artifact_output_path=\"/tmp\",\n", + " model_version=\"v1\",\n", + " model_table=\"aoa_model_v1\")\n", + "\n", + "train(context=ctx)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define Evaluation Function\n", + "\n", + "The evaluation function takes the following shape\n", + "\n", + "```python\n", + "def evaluate(context: ModelContext, **kwargs):\n", + " aoa_create_context()\n", + "\n", + " # read your model\n", + " model = joblib.load(f\"{context.artifact_input_path}/model.joblib\")\n", + " \n", + " # your evaluation logic\n", + " \n", + " record_evaluation_stats(...)\n", + "```\n", + "\n", + "You can execute this from the CLI or directly within the notebook as shown." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# %%writefile ../model_modules/evaluation.py\n", + "\n", + "from sklearn import metrics\n", + "from teradataml import DataFrame, copy_to_sql\n", + "from aoa import (\n", + " record_evaluation_stats,\n", + " save_plot,\n", + " aoa_create_context,\n", + " ModelContext\n", + ")\n", + "\n", + "import joblib\n", + "import json\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "\n", + "def evaluate(context: ModelContext, **kwargs):\n", + "\n", + " aoa_create_context()\n", + "\n", + " model = joblib.load(f\"{context.artifact_input_path}/model.joblib\")\n", + "\n", + " feature_names = context.dataset_info.feature_names\n", + " target_name = context.dataset_info.target_names[0]\n", + "\n", + " test_df = DataFrame.from_query(context.dataset_info.sql)\n", + " test_pdf = test_df.to_pandas(all_rows=True)\n", + "\n", + " X_test = test_pdf[feature_names]\n", + " y_test = test_pdf[target_name]\n", + "\n", + " print(\"Scoring\")\n", + " y_pred = model.predict(X_test)\n", + "\n", + " y_pred_tdf = pd.DataFrame(y_pred, columns=[target_name])\n", + " y_pred_tdf[\"PatientId\"] = test_pdf[\"PatientId\"].values\n", + "\n", + " evaluation = {\n", + " 'Accuracy': '{:.2f}'.format(metrics.accuracy_score(y_test, y_pred)),\n", + " 'Recall': '{:.2f}'.format(metrics.recall_score(y_test, y_pred)),\n", + " 'Precision': '{:.2f}'.format(metrics.precision_score(y_test, y_pred)),\n", + " 'f1-score': '{:.2f}'.format(metrics.f1_score(y_test, y_pred))\n", + " }\n", + "\n", + " with open(f\"{context.artifact_output_path}/metrics.json\", \"w+\") as f:\n", + " json.dump(evaluation, f)\n", + "\n", + " metrics.plot_confusion_matrix(model, X_test, y_test)\n", + " save_plot('Confusion Matrix', context=context)\n", + "\n", + " metrics.plot_roc_curve(model, X_test, y_test)\n", + " save_plot('ROC Curve', context=context)\n", + "\n", + " # xgboost has its own feature importance plot support but lets use shap as explainability example\n", + " import shap\n", + "\n", + " shap_explainer = shap.TreeExplainer(model['xgb'])\n", + " shap_values = shap_explainer.shap_values(X_test)\n", + "\n", + " shap.summary_plot(shap_values, X_test, feature_names=feature_names,\n", + " show=False, plot_size=(12, 8), plot_type='bar')\n", + " save_plot('SHAP Feature Importance', context=context)\n", + "\n", + " feature_importance = pd.DataFrame(list(zip(feature_names, np.abs(shap_values).mean(0))),\n", + " columns=['col_name', 'feature_importance_vals'])\n", + " feature_importance = feature_importance.set_index(\"col_name\").T.to_dict(orient='records'](0]\n", + "\n", + " predictions_table = \"predictions_tmp\"\n", + " copy_to_sql(df=y_pred_tdf, table_name=predictions_table, index=False, if_exists=\"replace\", temporary=True)\n", + "\n", + " record_evaluation_stats(features_df=test_df,\n", + " predicted_df=DataFrame.from_query(f\"SELECT * FROM {predictions_table}\"),\n", + " importance=feature_importance,\n", + " context=context)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:aoa.util.connections:teradataml context already exists. Skipping create_context.\n", + "Scoring\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + "ntree_limit is deprecated, use `iteration_range` or model slicing instead.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:aoa.stats.stats:Computing evaluation dataset statistics\n" + ] + }, + { + "data": { + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Define the ModelContext to test with. The ModelContext is created and managed automatically by ModelOps \n", + "# when it executes your code via CLI / UI. However, for testing in the notebook, you can define as follows\n", + "\n", + "# define the evaluation dataset \n", + "sql = \"\"\"\n", + "SELECT \n", + " F.*, D.hasdiabetes \n", + "FROM PIMA_PATIENT_FEATURES F \n", + "JOIN PIMA_PATIENT_DIAGNOSES D\n", + "ON F.patientid = D.patientid\n", + " WHERE D.patientid MOD 5 = 0\n", + "\"\"\"\n", + "\n", + "dataset_info = DatasetInfo(sql=sql,\n", + " entity_key=entity_key,\n", + " feature_names=feature_names,\n", + " target_names=target_names,\n", + " feature_metadata=feature_metadata)\n", + "\n", + "ctx = ModelContext(hyperparams=hyperparams,\n", + " dataset_info=dataset_info,\n", + " artifact_output_path=\"/tmp\",\n", + " artifact_input_path=\"/tmp\",\n", + " model_version=\"v1\",\n", + " model_table=\"aoa_model_v1\")\n", + "\n", + "evaluate(context=ctx)\n", + "\n", + "# view evaluation results\n", + "with open(f\"{ctx.artifact_output_path}/metrics.json\") as f:\n", + " print(json.load(f))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define Scoring Function\n", + "\n", + "The scoring function takes the following shape\n", + "\n", + "```python\n", + "def score(context: ModelContext, **kwargs):\n", + " aoa_create_context()\n", + "\n", + " # read your model\n", + " model = joblib.load(f\"{context.artifact_input_path}/model.joblib\")\n", + " \n", + " # your evaluation logic\n", + " \n", + " record_scoring_stats(...)\n", + "```\n", + "\n", + "You can execute this from the CLI or directly within the notebook as shown." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# %%writefile ../model_modules/scoring.py\n", + "\n", + "from teradataml import copy_to_sql, DataFrame\n", + "from aoa import (\n", + " record_scoring_stats,\n", + " aoa_create_context,\n", + " ModelContext\n", + ")\n", + "\n", + "import joblib\n", + "import pandas as pd\n", + "\n", + "\n", + "def score(context: ModelContext, **kwargs):\n", + "\n", + " aoa_create_context()\n", + "\n", + " model = joblib.load(f\"{context.artifact_input_path}/model.joblib\")\n", + "\n", + " feature_names = context.dataset_info.feature_names\n", + " target_name = context.dataset_info.target_names[0]\n", + " entity_key = context.dataset_info.entity_key\n", + "\n", + " features_tdf = DataFrame.from_query(context.dataset_info.sql)\n", + " features_pdf = features_tdf.to_pandas(all_rows=True)\n", + "\n", + " print(\"Scoring\")\n", + " predictions_pdf = model.predict(features_pdf[feature_names])\n", + "\n", + " print(\"Finished Scoring\")\n", + "\n", + " # store the predictions\n", + " predictions_pdf = pd.DataFrame(predictions_pdf, columns=[target_name])\n", + " predictions_pdf[entity_key] = features_pdf.index.values\n", + " # add job_id column so we know which execution this is from if appended to predictions table\n", + " predictions_pdf[\"job_id\"] = context.job_id\n", + "\n", + " # teradataml doesn't match column names on append.. 
and so to match / use same table schema as for byom predict\n", + " # example (see README.md), we must add empty json_report column and change column order manually (v17.0.0.4)\n", + " # CREATE MULTISET TABLE pima_patient_predictions\n", + " # (\n", + " # job_id VARCHAR(255), -- comes from airflow on job execution\n", + " # PatientId BIGINT, -- entity key as it is in the source data\n", + " # HasDiabetes BIGINT, -- if model automatically extracts target\n", + " # json_report CLOB(1048544000) CHARACTER SET UNICODE -- output of\n", + " # )\n", + " # PRIMARY INDEX ( job_id );\n", + " predictions_pdf[\"json_report\"] = \"\"\n", + " predictions_pdf = predictions_pdf[[\"job_id\", entity_key, target_name, \"json_report\"]]\n", + "\n", + " copy_to_sql(df=predictions_pdf,\n", + " schema_name=context.dataset_info.predictions_database,\n", + " table_name=context.dataset_info.predictions_table,\n", + " index=False,\n", + " if_exists=\"append\")\n", + " \n", + " print(\"Saved predictions in Teradata\")\n", + "\n", + " # calculate stats\n", + " predictions_df = DataFrame.from_query(f\"\"\"\n", + " SELECT \n", + " * \n", + " FROM {context.dataset_info.get_predictions_metadata_fqtn()} \n", + " WHERE job_id = '{context.job_id}'\n", + " \"\"\")\n", + "\n", + " record_scoring_stats(features_df=features_tdf, predicted_df=predictions_df, context=context)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:aoa.util.connections:teradataml context already exists. Skipping create_context.\n", + "Scoring\n", + "Finished Scoring\n", + "Saved predictions in Teradata\n", + "INFO:aoa.stats.stats:Computing scoring dataset statistics\n", + "WARNING:aoa.stats.metrics:Publishing scoring metrics is not enabled\n" + ] + } + ], + "source": [ + "# Define the ModelContext to test with. The ModelContext is created and managed automatically by ModelOps \n", + "# when it executes your code via CLI / UI. 
However, for testing in the notebook, you can define as follows\n", + "\n", + "# define the scoring dataset \n", + "\n", + "sql = \"\"\"\n", + "SELECT \n", + " F.*\n", + "FROM PIMA_PATIENT_FEATURES F \n", + " WHERE F.patientid MOD 5 = 0\n", + "\"\"\"\n", + "\n", + "# where to store predictions\n", + "predictions = {\n", + " \"database\": database,\n", + " \"table\": \"pima_patient_predictions_tmp\"\n", + "}\n", + "\n", + "import uuid\n", + "job_id=str(uuid.uuid4())\n", + "\n", + "dataset_info = DatasetInfo(sql=sql,\n", + " entity_key=entity_key,\n", + " feature_names=feature_names,\n", + " target_names=target_names,\n", + " feature_metadata=feature_metadata,\n", + " predictions=predictions)\n", + "\n", + "ctx = ModelContext(hyperparams=hyperparams,\n", + " dataset_info=dataset_info,\n", + " artifact_output_path=\"/tmp\",\n", + " artifact_input_path=\"/tmp\",\n", + " model_version=\"v1\",\n", + " model_table=\"aoa_model_v1\",\n", + " job_id=job_id)\n", + "\n", + "score(context=ctx)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "
job_idPatientIdHasDiabetesjson_report
2d16fcf4-78e3-4801-a052-a9c224814b853601
2d16fcf4-78e3-4801-a052-a9c224814b85451
2d16fcf4-78e3-4801-a052-a9c224814b854900
2d16fcf4-78e3-4801-a052-a9c224814b856601
2d16fcf4-78e3-4801-a052-a9c224814b85301
2d16fcf4-78e3-4801-a052-a9c224814b852201
2d16fcf4-78e3-4801-a052-a9c224814b853551
2d16fcf4-78e3-4801-a052-a9c224814b855600
2d16fcf4-78e3-4801-a052-a9c224814b854600
2d16fcf4-78e3-4801-a052-a9c224814b853250
" + ], + "text/plain": [ + " job_id PatientId HasDiabetes json_report\n", + "0 2d16fcf4-78e3-4801-a052-a9c224814b85 360 1 \n", + "1 2d16fcf4-78e3-4801-a052-a9c224814b85 45 1 \n", + "2 2d16fcf4-78e3-4801-a052-a9c224814b85 490 0 \n", + "3 2d16fcf4-78e3-4801-a052-a9c224814b85 660 1 \n", + "4 2d16fcf4-78e3-4801-a052-a9c224814b85 30 1 \n", + "5 2d16fcf4-78e3-4801-a052-a9c224814b85 220 1 \n", + "6 2d16fcf4-78e3-4801-a052-a9c224814b85 355 1 \n", + "7 2d16fcf4-78e3-4801-a052-a9c224814b85 560 0 \n", + "8 2d16fcf4-78e3-4801-a052-a9c224814b85 460 0 \n", + "9 2d16fcf4-78e3-4801-a052-a9c224814b85 325 0 " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "DataFrame.from_query(f\"SELECT * FROM {database}.pima_patient_predictions_tmp\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define Model Metadata\n", + "\n", + "Finally, create the configuration files.\n", + "\n", + "Requirements file with the dependencies and versions\n", + "\n", + "```\n", + "%%writefile ../model_modules/requirements.txt\n", + "xgboost==0.90\n", + "scikit-learn==0.24.2\n", + "shap==0.36.0\n", + "matplotlib==3.3.1\n", + "teradataml==17.0.0.4\n", + "nyoka==4.3.0\n", + "aoa==6.0.0\n", + "```\n", + "\n", + "The hyper parameter configuration (defaults)\n", + "```\n", + "%%writefile ../config.json\n", + "{\n", + " \"hyperParameters\": {\n", + " \"eta\": 0.2,\n", + " \"max_depth\": 6\n", + " }\n", + "}\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:py39]", + "language": "python", + "name": "conda-env-py39-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/quickstarts/modelops/attachments/ModelOps_Operationalize_v7.ipynb b/quickstarts/modelops/attachments/ModelOps_Operationalize_v7.ipynb new file mode 100755 index 0000000000..0447e1182f --- /dev/null +++ b/quickstarts/modelops/attachments/ModelOps_Operationalize_v7.ipynb @@ -0,0 +1,732 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Overview\n", + "\n", + "Once we have finished experiementation and found a good model, we want to operationalize it. \n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "import sys\n", + "logging.basicConfig(stream=sys.stdout, level=logging.INFO)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Host: tdprd.td.teradata.com\n", + "Username: wf250003\n", + "Password: ········\n" + ] + } + ], + "source": [ + "from teradataml import create_context\n", + "import getpass\n", + "\n", + "host = input(\"Host: \")\n", + "username = input(\"Username: \")\n", + "password = getpass.getpass(\"Password: \")\n", + "val_db = input(\"VAL DB: \")\n", + "byom_db = input(\"BYOM DB: \")\n", + "\n", + "# configure byom/val installation\n", + "configure.val_install_location = val_db\n", + "configure.byom_install_location = byom_db\n", + "\n", + "# by default we assume your are using your user database. 
change as required\n", + "database = username\n", + "\n", + "create_context(host=host, username=username, password=password, logmech=\"TDNEGO\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define Training Function\n", + "\n", + "The training function takes the following shape\n", + "\n", + "```python\n", + "def train(context: ModelContext, **kwargs):\n", + " aoa_create_context()\n", + " \n", + " # your training code\n", + " \n", + " # save your model\n", + " joblib.dump(model, f\"{context.artifact_output_path}/model.joblib\")\n", + " \n", + " record_training_stats(...)\n", + "```\n", + "\n", + "You can execute this from the CLI or directly within the notebook as shown." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# %%writefile ../model_modules/training.py\n", + "\n", + "from xgboost import XGBClassifier\n", + "from sklearn.preprocessing import MinMaxScaler\n", + "from sklearn.pipeline import Pipeline\n", + "from nyoka import xgboost_to_pmml\n", + "from teradataml import DataFrame\n", + "from aoa import (\n", + " record_training_stats,\n", + " save_plot,\n", + " aoa_create_context,\n", + " ModelContext\n", + ")\n", + "\n", + "import joblib\n", + "\n", + "\n", + "def train(context: ModelContext, **kwargs):\n", + " aoa_create_context()\n", + "\n", + " feature_names = context.dataset_info.feature_names\n", + " target_name = context.dataset_info.target_names[0]\n", + "\n", + " # read training dataset from Teradata and convert to pandas\n", + " train_df = DataFrame.from_query(context.dataset_info.sql)\n", + " train_pdf = train_df.to_pandas(all_rows=True)\n", + "\n", + " # split data into X and y\n", + " X_train = train_pdf[feature_names]\n", + " y_train = train_pdf[target_name]\n", + "\n", + " print(\"Starting training...\")\n", + "\n", + " # fit model to training data\n", + " model = Pipeline([('scaler', MinMaxScaler()),\n", + " ('xgb', XGBClassifier(eta=context.hyperparams[\"eta\"],\n", + " max_depth=context.hyperparams[\"max_depth\"]))])\n", + "\n", + " model.fit(X_train, y_train)\n", + "\n", + " print(\"Finished training\")\n", + "\n", + " # export model artefacts\n", + " joblib.dump(model, f\"{context.artifact_output_path}/model.joblib\")\n", + "\n", + " # we can also save as pmml so it can be used for In-Vantage scoring etc.\n", + " xgboost_to_pmml(pipeline=model, col_names=feature_names, target_name=target_name,\n", + " pmml_f_name=f\"{context.artifact_output_path}/model.pmml\")\n", + "\n", + " print(\"Saved trained model\")\n", + "\n", + " from xgboost import plot_importance\n", + " model[\"xgb\"].get_booster().feature_names = feature_names\n", + " plot_importance(model[\"xgb\"].get_booster(), max_num_features=10)\n", + " save_plot(\"feature_importance.png\", context=context)\n", + "\n", + " feature_importance = model[\"xgb\"].get_booster().get_score(importance_type=\"weight\")\n", + "\n", + " record_training_stats(train_df,\n", + " features=feature_names,\n", + " predictors=[target_name],\n", + " categorical=[target_name],\n", + " importance=feature_importance,\n", + " context=context)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:aoa.util.connections:teradataml context already exists. 
Skipping create_context.\n", + "Starting training...\n", + "Finished training\n", + "Saved trained model\n", + "INFO:aoa.stats.stats:Computing training dataset statistics\n" + ] + }, + { + "data": { + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from aoa import ModelContext, DatasetInfo\n", + "from teradataml import configure\n", + "\n", + "# Define the ModelContext to test with. The ModelContext is created and managed automatically by ModelOps \n", + "# when it executes your code via CLI / UI. However, for testing in the notebook, you can define as follows\n", + "\n", + "# define the training dataset \n", + "sql = \"\"\"\n", + "SELECT \n", + " F.*, D.hasdiabetes\n", + "FROM PIMA_PATIENT_FEATURES F \n", + "JOIN PIMA_PATIENT_DIAGNOSES D\n", + "ON F.patientid = D.patientid\n", + " WHERE D.patientid MOD 5 <> 0\n", + "\"\"\"\n", + "\n", + "feature_metadata = {\n", + " \"database\": database,\n", + " \"table\": \"aoa_feature_metadata\"\n", + "}\n", + "hyperparams = {\"max_depth\": 5, \"eta\": 0.2}\n", + "\n", + "entity_key = \"PatientId\"\n", + "target_names = [\"HasDiabetes\"]\n", + "feature_names = [\"NumTimesPrg\", \"PlGlcConc\", \"BloodP\", \"SkinThick\", \"TwoHourSerIns\", \"BMI\", \"DiPedFunc\", \"Age\"]\n", + " \n", + "dataset_info = DatasetInfo(sql=sql,\n", + " entity_key=entity_key,\n", + " feature_names=feature_names,\n", + " target_names=target_names,\n", + " feature_metadata=feature_metadata)\n", + "\n", + "\n", + "ctx = ModelContext(hyperparams=hyperparams,\n", + " dataset_info=dataset_info,\n", + " artifact_output_path=\"/tmp\",\n", + " model_version=\"v1\",\n", + " model_table=\"aoa_model_v1\")\n", + "\n", + "train(context=ctx)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define Evaluation Function\n", + "\n", + "The evaluation function takes the following shape\n", + "\n", + "```python\n", + "def evaluate(context: ModelContext, **kwargs):\n", + " aoa_create_context()\n", + "\n", + " # read your model\n", + " model = joblib.load(f\"{context.artifact_input_path}/model.joblib\")\n", + " \n", + " # your evaluation logic\n", + " \n", + " record_evaluation_stats(...)\n", + "```\n", + "\n", + "You can execute this from the CLI or directly within the notebook as shown." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# %%writefile ../model_modules/evaluation.py\n", + "\n", + "from sklearn import metrics\n", + "from teradataml import DataFrame, copy_to_sql\n", + "from aoa import (\n", + " record_evaluation_stats,\n", + " save_plot,\n", + " aoa_create_context,\n", + " ModelContext\n", + ")\n", + "\n", + "import joblib\n", + "import json\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "\n", + "def evaluate(context: ModelContext, **kwargs):\n", + "\n", + " aoa_create_context()\n", + "\n", + " model = joblib.load(f\"{context.artifact_input_path}/model.joblib\")\n", + "\n", + " feature_names = context.dataset_info.feature_names\n", + " target_name = context.dataset_info.target_names[0]\n", + "\n", + " test_df = DataFrame.from_query(context.dataset_info.sql)\n", + " test_pdf = test_df.to_pandas(all_rows=True)\n", + "\n", + " X_test = test_pdf[feature_names]\n", + " y_test = test_pdf[target_name]\n", + "\n", + " print(\"Scoring\")\n", + " y_pred = model.predict(X_test)\n", + "\n", + " y_pred_tdf = pd.DataFrame(y_pred, columns=[target_name])\n", + " y_pred_tdf[\"PatientId\"] = test_pdf[\"PatientId\"].values\n", + "\n", + " evaluation = {\n", + " 'Accuracy': '{:.2f}'.format(metrics.accuracy_score(y_test, y_pred)),\n", + " 'Recall': '{:.2f}'.format(metrics.recall_score(y_test, y_pred)),\n", + " 'Precision': '{:.2f}'.format(metrics.precision_score(y_test, y_pred)),\n", + " 'f1-score': '{:.2f}'.format(metrics.f1_score(y_test, y_pred))\n", + " }\n", + "\n", + " with open(f\"{context.artifact_output_path}/metrics.json\", \"w+\") as f:\n", + " json.dump(evaluation, f)\n", + "\n", + " metrics.plot_confusion_matrix(model, X_test, y_test)\n", + " save_plot('Confusion Matrix', context=context)\n", + "\n", + " metrics.plot_roc_curve(model, X_test, y_test)\n", + " save_plot('ROC Curve', context=context)\n", + "\n", + " # xgboost has its own feature importance plot support but lets use shap as explainability example\n", + " import shap\n", + "\n", + " shap_explainer = shap.TreeExplainer(model['xgb'])\n", + " shap_values = shap_explainer.shap_values(X_test)\n", + "\n", + " shap.summary_plot(shap_values, X_test, feature_names=feature_names,\n", + " show=False, plot_size=(12, 8), plot_type='bar')\n", + " save_plot('SHAP Feature Importance', context=context)\n", + "\n", + " feature_importance = pd.DataFrame(list(zip(feature_names, np.abs(shap_values).mean(0))),\n", + " columns=['col_name', 'feature_importance_vals'])\n", + " feature_importance = feature_importance.set_index(\"col_name\").T.to_dict(orient='records'](0]\n", + "\n", + " predictions_table = \"predictions_tmp\"\n", + " copy_to_sql(df=y_pred_tdf, table_name=predictions_table, index=False, if_exists=\"replace\", temporary=True)\n", + "\n", + " record_evaluation_stats(features_df=test_df,\n", + " predicted_df=DataFrame.from_query(f\"SELECT * FROM {predictions_table}\"),\n", + " importance=feature_importance,\n", + " context=context)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:aoa.util.connections:teradataml context already exists. Skipping create_context.\n", + "Scoring\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + "ntree_limit is deprecated, use `iteration_range` or model slicing instead.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:aoa.stats.stats:Computing evaluation dataset statistics\n" + ] + }, + { + "data": { + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Define the ModelContext to test with. The ModelContext is created and managed automatically by ModelOps \n", + "# when it executes your code via CLI / UI. However, for testing in the notebook, you can define as follows\n", + "\n", + "# define the evaluation dataset \n", + "sql = \"\"\"\n", + "SELECT \n", + " F.*, D.hasdiabetes \n", + "FROM PIMA_PATIENT_FEATURES F \n", + "JOIN PIMA_PATIENT_DIAGNOSES D\n", + "ON F.patientid = D.patientid\n", + " WHERE D.patientid MOD 5 = 0\n", + "\"\"\"\n", + "\n", + "dataset_info = DatasetInfo(sql=sql,\n", + " entity_key=entity_key,\n", + " feature_names=feature_names,\n", + " target_names=target_names,\n", + " feature_metadata=feature_metadata)\n", + "\n", + "ctx = ModelContext(hyperparams=hyperparams,\n", + " dataset_info=dataset_info,\n", + " artifact_output_path=\"/tmp\",\n", + " artifact_input_path=\"/tmp\",\n", + " model_version=\"v1\",\n", + " model_table=\"aoa_model_v1\")\n", + "\n", + "evaluate(context=ctx)\n", + "\n", + "# view evaluation results\n", + "with open(f\"{ctx.artifact_output_path}/metrics.json\") as f:\n", + " print(json.load(f))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define Scoring Function\n", + "\n", + "The scoring function takes the following shape\n", + "\n", + "```python\n", + "def score(context: ModelContext, **kwargs):\n", + " aoa_create_context()\n", + "\n", + " # read your model\n", + " model = joblib.load(f\"{context.artifact_input_path}/model.joblib\")\n", + " \n", + " # your evaluation logic\n", + " \n", + " record_scoring_stats(...)\n", + "```\n", + "\n", + "You can execute this from the CLI or directly within the notebook as shown." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# %%writefile ../model_modules/scoring.py\n", + "\n", + "from teradataml import copy_to_sql, DataFrame\n", + "from aoa import (\n", + " record_scoring_stats,\n", + " aoa_create_context,\n", + " ModelContext\n", + ")\n", + "\n", + "import joblib\n", + "import pandas as pd\n", + "\n", + "\n", + "def score(context: ModelContext, **kwargs):\n", + "\n", + " aoa_create_context()\n", + "\n", + " model = joblib.load(f\"{context.artifact_input_path}/model.joblib\")\n", + "\n", + " feature_names = context.dataset_info.feature_names\n", + " target_name = context.dataset_info.target_names[0]\n", + " entity_key = context.dataset_info.entity_key\n", + "\n", + " features_tdf = DataFrame.from_query(context.dataset_info.sql)\n", + " features_pdf = features_tdf.to_pandas(all_rows=True)\n", + "\n", + " print(\"Scoring\")\n", + " predictions_pdf = model.predict(features_pdf[feature_names])\n", + "\n", + " print(\"Finished Scoring\")\n", + "\n", + " # store the predictions\n", + " predictions_pdf = pd.DataFrame(predictions_pdf, columns=[target_name])\n", + " predictions_pdf[entity_key] = features_pdf.index.values\n", + " # add job_id column so we know which execution this is from if appended to predictions table\n", + " predictions_pdf[\"job_id\"] = context.job_id\n", + "\n", + " # teradataml doesn't match column names on append.. 
and so to match / use same table schema as for byom predict\n", + " # example (see README.md), we must add empty json_report column and change column order manually (v17.0.0.4)\n", + " # CREATE MULTISET TABLE pima_patient_predictions\n", + " # (\n", + " # job_id VARCHAR(255), -- comes from airflow on job execution\n", + " # PatientId BIGINT, -- entity key as it is in the source data\n", + " # HasDiabetes BIGINT, -- if model automatically extracts target\n", + " # json_report CLOB(1048544000) CHARACTER SET UNICODE -- output of\n", + " # )\n", + " # PRIMARY INDEX ( job_id );\n", + " predictions_pdf[\"json_report\"] = \"\"\n", + " predictions_pdf = predictions_pdf[[\"job_id\", entity_key, target_name, \"json_report\"]]\n", + "\n", + " copy_to_sql(df=predictions_pdf,\n", + " schema_name=context.dataset_info.predictions_database,\n", + " table_name=context.dataset_info.predictions_table,\n", + " index=False,\n", + " if_exists=\"append\")\n", + " \n", + " print(\"Saved predictions in Teradata\")\n", + "\n", + " # calculate stats\n", + " predictions_df = DataFrame.from_query(f\"\"\"\n", + " SELECT \n", + " * \n", + " FROM {context.dataset_info.get_predictions_metadata_fqtn()} \n", + " WHERE job_id = '{context.job_id}'\n", + " \"\"\")\n", + "\n", + " record_scoring_stats(features_df=features_tdf, predicted_df=predictions_df, context=context)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:aoa.util.connections:teradataml context already exists. Skipping create_context.\n", + "Scoring\n", + "Finished Scoring\n", + "Saved predictions in Teradata\n", + "INFO:aoa.stats.stats:Computing scoring dataset statistics\n", + "WARNING:aoa.stats.metrics:Publishing scoring metrics is not enabled\n" + ] + } + ], + "source": [ + "# Define the ModelContext to test with. The ModelContext is created and managed automatically by ModelOps \n", + "# when it executes your code via CLI / UI. 
However, for testing in the notebook, you can define as follows\n", + "\n", + "# define the scoring dataset \n", + "\n", + "sql = \"\"\"\n", + "SELECT \n", + " F.*\n", + "FROM PIMA_PATIENT_FEATURES F \n", + " WHERE F.patientid MOD 5 = 0\n", + "\"\"\"\n", + "\n", + "# where to store predictions\n", + "predictions = {\n", + " \"database\": database,\n", + " \"table\": \"pima_patient_predictions_tmp\"\n", + "}\n", + "\n", + "import uuid\n", + "job_id=str(uuid.uuid4())\n", + "\n", + "dataset_info = DatasetInfo(sql=sql,\n", + " entity_key=entity_key,\n", + " feature_names=feature_names,\n", + " target_names=target_names,\n", + " feature_metadata=feature_metadata,\n", + " predictions=predictions)\n", + "\n", + "ctx = ModelContext(hyperparams=hyperparams,\n", + " dataset_info=dataset_info,\n", + " artifact_output_path=\"/tmp\",\n", + " artifact_input_path=\"/tmp\",\n", + " model_version=\"v1\",\n", + " model_table=\"aoa_model_v1\",\n", + " job_id=job_id)\n", + "\n", + "score(context=ctx)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "
job_idPatientIdHasDiabetesjson_report
2d16fcf4-78e3-4801-a052-a9c224814b853601
2d16fcf4-78e3-4801-a052-a9c224814b85451
2d16fcf4-78e3-4801-a052-a9c224814b854900
2d16fcf4-78e3-4801-a052-a9c224814b856601
2d16fcf4-78e3-4801-a052-a9c224814b85301
2d16fcf4-78e3-4801-a052-a9c224814b852201
2d16fcf4-78e3-4801-a052-a9c224814b853551
2d16fcf4-78e3-4801-a052-a9c224814b855600
2d16fcf4-78e3-4801-a052-a9c224814b854600
2d16fcf4-78e3-4801-a052-a9c224814b853250
" + ], + "text/plain": [ + " job_id PatientId HasDiabetes json_report\n", + "0 2d16fcf4-78e3-4801-a052-a9c224814b85 360 1 \n", + "1 2d16fcf4-78e3-4801-a052-a9c224814b85 45 1 \n", + "2 2d16fcf4-78e3-4801-a052-a9c224814b85 490 0 \n", + "3 2d16fcf4-78e3-4801-a052-a9c224814b85 660 1 \n", + "4 2d16fcf4-78e3-4801-a052-a9c224814b85 30 1 \n", + "5 2d16fcf4-78e3-4801-a052-a9c224814b85 220 1 \n", + "6 2d16fcf4-78e3-4801-a052-a9c224814b85 355 1 \n", + "7 2d16fcf4-78e3-4801-a052-a9c224814b85 560 0 \n", + "8 2d16fcf4-78e3-4801-a052-a9c224814b85 460 0 \n", + "9 2d16fcf4-78e3-4801-a052-a9c224814b85 325 0 " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "DataFrame.from_query(f\"SELECT * FROM {database}.pima_patient_predictions_tmp\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define Model Metadata\n", + "\n", + "Finally, create the configuration files.\n", + "\n", + "Requirements file with the dependencies and versions\n", + "\n", + "```\n", + "%%writefile ../model_modules/requirements.txt\n", + "xgboost==0.90\n", + "scikit-learn==0.24.2\n", + "shap==0.36.0\n", + "matplotlib==3.3.1\n", + "teradataml==17.0.0.4\n", + "nyoka==4.3.0\n", + "aoa==6.0.0\n", + "```\n", + "\n", + "The hyper parameter configuration (defaults)\n", + "```\n", + "%%writefile ../config.json\n", + "{\n", + " \"hyperParameters\": {\n", + " \"eta\": 0.2,\n", + " \"max_depth\": 6\n", + " }\n", + "}\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:py39]", + "language": "python", + "name": "conda-env-py39-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/quickstarts/modelops/attachments/ModelOps_Quickstart_BYOM.zip b/quickstarts/modelops/attachments/ModelOps_Quickstart_BYOM.zip new file mode 100644 index 0000000000..c0fe76c0d0 Binary files /dev/null and b/quickstarts/modelops/attachments/ModelOps_Quickstart_BYOM.zip differ diff --git a/quickstarts/modelops/attachments/ModelOps_Training_v6.ipynb b/quickstarts/modelops/attachments/ModelOps_Training_v6.ipynb new file mode 100755 index 0000000000..fb62f9d9cf --- /dev/null +++ b/quickstarts/modelops/attachments/ModelOps_Training_v6.ipynb @@ -0,0 +1,467 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "f6008b6e", + "metadata": {}, + "source": [ + "## Setup\n", + "\n", + "\n", + "Ensure you have the following packages and python libraries installed \n", + "\n", + "```code\n", + "pip install teradataml==17.0.0.4 aoa==6.1.0 pandas==1.1.5\n", + "```\n", + "\n", + "The remainder of the notebook runs through the following steps\n", + "\n", + "- Connect to Vantage\n", + "- Create DDLs\n", + "- Import Data\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "0528bd6a", + "metadata": {}, + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + "Host: tdprd.td.teradata.com\n", + "Username: wf250003\n", + "Password: ···········\n" + ] + } + ], + "source": [ + "from teradataml import create_context\n", + "import getpass\n", + "import logging\n", + "import sys\n", + "import urllib\n", + "\n", + "logging.basicConfig(stream=sys.stdout, 
level=logging.INFO)\n", + "\n", + "\n", + "host = input(\"Host:\")\n", + "username = input(\"Username:\")\n", + "password = getpass.getpass(\"Password:\")\n", + "\n", + "\n", + "engine = create_context(host=host, username=username, password=urllib.parse.quote(password), logmech=\"TDNEGO\")" + ] + }, + { + "cell_type": "markdown", + "id": "4eed19e0", + "metadata": {}, + "source": [ + "### Create DDLs\n", + "\n", + "Create the following tables \n", + "\n", + "- aoa_feature_metadata \n", + "- aoa_byom_models\n", + "- pima_patient_predictions\n", + "\n", + "`aoa_feature_metadata` is used to store the profiling metadata for the features so that we can consistently compute the data drift and model drift statistics. This table can also be created via the CLI by executing \n", + "\n", + "```bash\n", + "aoa feature create-stats-table -m .\n", + "```\n", + "\n", + "`pima_patient_predictions` is used for storing the predictions of the model scoring for the demo use case" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "9875d156", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from aoa import create_features_stats_table\n", + "from teradataml import get_context\n", + "\n", + "# Note: assuming we are using user database for training. If another database (e.g. datalab) is being used, please update.\n", + "# Also note, if a shared datalab is being used, only one user should execute the following DDL/DML commands\n", + "database = username\n", + "\n", + "create_features_stats_table(f\"{database}.aoa_feature_metadata\")\n", + "\n", + "get_context().execute(f\"\"\"\n", + "CREATE MULTISET TABLE {database}.aoa_byom_models\n", + " (\n", + " model_version VARCHAR(255),\n", + " model_id VARCHAR(255),\n", + " model_type VARCHAR(255),\n", + " project_id VARCHAR(255),\n", + " deployed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,\n", + " model BLOB(2097088000)\n", + " )\n", + " UNIQUE PRIMARY INDEX ( model_version );\n", + "\"\"\")\n", + "\n", + "get_context().execute(f\"\"\"\n", + "CREATE MULTISET TABLE {database}.pima_patient_predictions\n", + " (\n", + " job_id VARCHAR(255),\n", + " PatientId BIGINT,\n", + " HasDiabetes BIGINT,\n", + " json_report CLOB(1048544000) CHARACTER SET UNICODE\n", + " )\n", + " PRIMARY INDEX ( job_id );\n", + "\"\"\")" + ] + }, + { + "cell_type": "markdown", + "id": "b237d537", + "metadata": {}, + "source": [ + "### Import Data\n", + "\n", + "Create and import the data for the following two tables\n", + "\n", + "- pima_patient_features\n", + "- pima_patient_diagnoses\n", + "- aoa_feature_metadata\n", + "\n", + "`pima_patient_features` contains the features related to the patients medical history.\n", + "\n", + "`pima_patient_diagnoses` contains the diabetes diagnostic results for the patients.\n", + "\n", + "`aoa_feature_metadata` contains the feature statistics data for the `pima_patient_features` and `pima_patient_diagnoses`\n", + "\n", + "Note the `pima_patient_feature` can be populated via the CLI by executing \n", + "\n", + "```bash\n", + "aoa feature compute-stats -s .PIMA -m . 
-t continuous -c numtimesprg,plglcconc,bloodp,skinthick,twohourserins,bmi,dipedfunc,age \n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "07461699", + "metadata": {}, + "outputs": [], + "source": [ + "from teradataml import copy_to_sql, DataFrame\n", + "from teradatasqlalchemy.types import *\n", + "import pandas as pd\n", + "\n", + "df = pd.read_csv(\"data/pima_patient_features.csv\")\n", + "copy_to_sql(df=df, \n", + " table_name=\"pima_patient_features\", \n", + " schema_name=database,\n", + " primary_index=\"PatientId\", \n", + " if_exists=\"replace\", \n", + " types={\n", + " \"PatientId\": INTEGER,\n", + " \"NumTimesPrg\": INTEGER, \n", + " \"PlGlcConc\": INTEGER,\n", + " \"BloodP\": INTEGER,\n", + " \"SkinThick\": INTEGER,\n", + " \"TwoHourSerIns\": INTEGER,\n", + " \"BMI\": FLOAT,\n", + " \"DiPedFunc\": FLOAT,\n", + " \"Age\": INTEGER\n", + " })\n", + "\n", + "df = pd.read_csv(\"data/pima_patient_diagnoses.csv\")\n", + "copy_to_sql(df=df, \n", + " table_name=\"pima_patient_diagnoses\", \n", + " schema_name=database,\n", + " primary_index=\"PatientId\", \n", + " if_exists=\"replace\", \n", + " types={\n", + " \"PatientId\": INTEGER,\n", + " \"HasDiabetes\": INTEGER\n", + " })\n", + "\n", + "# we can compute this from the CLI also - but lets import pre-computed for now.\n", + "df = pd.read_csv(\"data/aoa_feature_metadata.csv\")\n", + "copy_to_sql(df=df, \n", + " table_name=\"aoa_feature_metadata\", \n", + " schema_name=database,\n", + " if_exists=\"append\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "2b0cdd53", + "metadata": {}, + "source": [ + "## ModelOps UI\n", + "\n", + "#### Add Project\n", + "\n", + "- create project\n", + " - Details\n", + " - Name: Demo {your-name}\n", + " - Description: ModelOps Demo\n", + " - Group: {your-name}\n", + " - Path: https://github.com/Teradata/modelops-demo-models \n", + " - Credentials: No Credentials\n", + " - Branch: master\n", + " - Save And Continue\n", + " - Service Connection\n", + " - Skip for now\n", + " - Personal Connection\n", + " - Name: Vantage Personal {your-name}\n", + " - Description: Vantage Demo Env\n", + " - Host: {your-host}\n", + " - Database: {your-db}\n", + " - VAL Database: {your-val-db}\n", + " - BYOM Database: (your-byom-db}\n", + " - Login Mech: TDNEGO\n", + " - Username/Password\n", + " \n", + " \n", + "#### Add Datasets\n", + "\n", + "- create dataset template\n", + " - Catalog\n", + " - Name: PIMA\n", + " - Description: PIMA Diabetes\n", + " - Feature Catalog: Vantage\n", + " - Database: {your-db}\n", + " - Table: aoa_feature_metadata\n", + " - Features\n", + " - Query: `SELECT * FROM {your-db}.pima_patient_features`\n", + " - Entity Key: PatientId\n", + " - Features: NumTimesPrg, PlGlcConc, BloodP, SkinThick, TwoHourSerIns, BMI, DiPedFunc, Age\n", + " - Entity & Target\n", + " - Query: `SELECT * FROM {your-db}.pima_patient_diagnoses`\n", + " - Entity Key: PatientId\n", + " - Target: HasDiabetes\n", + " - Predictions\n", + " - Database: {your-db}\n", + " - Table: pima_patient_predictions\n", + " - Entity Selection: `SELECT * FROM pima_patient_features WHERE patientid MOD 5 = 0`\n", + " - BYOM Target Column: `CAST(CAST(json_report AS JSON).JSONExtractValue('$.predicted_HasDiabetes') AS INT)`\n", + " \n", + " \n", + "- create training dataset\n", + " - Basic\n", + " - Name: Train\n", + " - Description: Training dataset\n", + " - Scope: Training\n", + " - Entity & Target\n", + " - Query: `SELECT * FROM {your-db}.pima_patient_diagnoses WHERE patientid MOD 5 <> 0`\n", + " \n", + "- 
create evaluation dataset\n", + " - Basic\n", + " - Name: Evaluate\n", + " - Description: Evaluation dataset\n", + " - Scope: Evaluation\n", + " - Entity & Target\n", + " - Query: `SELECT * FROM {your-db}.pima_patient_diagnoses WHERE patientid MOD 5 = 0`\n", + " \n", + "\n", + "#### Model Lifecycle\n", + "\n", + "- Python Diabetes Prediction\n", + " - Train\n", + " - Evaluate\n", + " - Review evaluation report\n", + " - Approve \n", + " - Deploy \n", + " - Deployments/executions\n", + " - Retire\n", + "- R Diabetes Prediction\n", + " - Train\n", + " - Evaluate\n", + " - Review evaluation report\n", + " - Approve \n", + " - Deploy \n", + " - Deployments/executions\n", + " - Retire\n", + "- BYOM Diabetes Prediction\n", + " - Run BYOM Notebook \n", + " - Define BYOM Model \n", + " - Import Version\n", + " - Evaluate\n", + " - Review evaluation report\n", + " - Approve \n", + " - Deploy \n", + " - Deployments/executions\n", + " - Retire" + ] + }, + { + "cell_type": "markdown", + "id": "17a64068", + "metadata": {}, + "source": [ + "#### View Predictions\n", + "\n", + "In the next version of ModelOps, you will be able to view the predictions that follow the standard pattern directly via the UI. However, for now, we can view it here. As the same predictions table contains the predictions for all the jobs, we filter by the `airflow_job_id`. You can find this id in the UI under deployment executions." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "904b2fb9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
job_idPatientIdHasDiabetesjson_report
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [job_id, PatientId, HasDiabetes, json_report]\n", + "Index: []" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "from teradataml import get_connection\n", + "\n", + "pd.options.display.max_colwidth = 250\n", + "\n", + "airflow_job_id = \"5761d5c1-bf57-456b-8076-c3062be0b544-scheduled__2022-07-11T00:00:00+00:00\"\n", + "\n", + "pd.read_sql(f\"SELECT TOP 5 * FROM pima_patient_predictions WHERE job_id='{airflow_job_id}'\", get_connection())" + ] + }, + { + "cell_type": "markdown", + "id": "d479c9cb", + "metadata": {}, + "source": [ + "## CLI \n", + "\n", + "\n", + "```bash\n", + "pip install aoa==6.1.0\n", + "```\n", + "\n", + "##### Copy CLI Config\n", + "\n", + "```\n", + "Copy the CLI config from ModelOps UI -> Session Details -> CLI config\n", + "```\n", + "\n", + "##### Add Dataset Connection\n", + "\n", + "```bash\n", + "aoa connection add\n", + "```\n", + "\n", + "##### List Feature Metadata\n", + "\n", + "```bash\n", + "aoa feature list-stats -m {your-db}.aoa_feature_metadata\n", + "```\n", + "\n", + "##### Clone Project\n", + "\n", + "```bash\n", + "aoa clone \n", + "```\n", + "\n", + "```bash\n", + "cd modelops-demo-models\n", + "```\n", + "\n", + "##### Install Model Dependencies\n", + "\n", + "```bash\n", + "pip install -r model_definitions/python-diabetes/model_modules/requirements.txt\n", + "```\n", + "\n", + "##### Train Model\n", + "\n", + "```bash\n", + "aoa run\n", + "```\n", + "\n", + "##### Add Model\n", + "\n", + "```bash\n", + "aoa add\n", + "```\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b63bd4d5", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/quickstarts/modelops/attachments/ModelOps_Training_v7.ipynb b/quickstarts/modelops/attachments/ModelOps_Training_v7.ipynb new file mode 100644 index 0000000000..dc93b2d16e --- /dev/null +++ b/quickstarts/modelops/attachments/ModelOps_Training_v7.ipynb @@ -0,0 +1,410 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "dcc29d47", + "metadata": {}, + "source": [ + "## Setup\n", + "\n", + "The remainder of the notebook runs through the following steps\n", + "\n", + "- Connect to Vantage\n", + "- Create DDLs\n", + "- Import Data\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "426c443a", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install teradataml==17.20.0.3 aoa==7.0.1 pandas==1.1.5" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8a780585", + "metadata": {}, + "outputs": [], + "source": [ + "from teradataml import create_context\n", + "import getpass\n", + "import logging\n", + "import sys\n", + "import urllib\n", + "\n", + "logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n", + "\n", + "\n", + "host = input(\"Host:\")\n", + "username = input(\"Username:\")\n", + "password = getpass.getpass(\"Password:\")\n", + "database = input(\"Database (defaults to user):\")\n", + "\n", + "if not database:\n", + " database = username\n", + "\n", + "\n", + 
"engine = create_context(host=host, \n", + " username=username, \n", + " password=urllib.parse.quote(password), \n", + " logmech=\"TDNEGO\",\n", + " database=database)" + ] + }, + { + "cell_type": "markdown", + "id": "88d3dff4", + "metadata": {}, + "source": [ + "### Create DDLs\n", + "\n", + "Create the following tables \n", + "\n", + "- aoa_statistics_metadata \n", + "- aoa_byom_models\n", + "- pima_patient_predictions\n", + "\n", + "`aoa_statistics_metadata` is used to store the profiling metadata for the features so that we can consistently compute the data drift and model drift statistics. This table can also be created via the CLI by executing \n", + "\n", + "```bash\n", + "aoa feature create-stats-table -e -m .\n", + "```\n", + "\n", + "`pima_patient_predictions` is used for storing the predictions of the model scoring for the demo use case" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "769f5cfe", + "metadata": {}, + "outputs": [], + "source": [ + "from aoa import create_features_stats_table\n", + "from teradataml import get_context\n", + "\n", + "# Note: assuming we are using user database for training. If another database (e.g. datalab) is being used, please update.\n", + "# Also note, if a shared datalab is being used, only one user should execute the following DDL/DML commands\n", + "database = username\n", + "\n", + "create_features_stats_table(f\"{database}.aoa_statistics_metadata\")\n", + "\n", + "get_context().execute(f\"\"\"\n", + "CREATE MULTISET TABLE {database}.aoa_byom_models\n", + " (\n", + " model_version VARCHAR(255),\n", + " model_id VARCHAR(255),\n", + " model_type VARCHAR(255),\n", + " project_id VARCHAR(255),\n", + " deployed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,\n", + " model BLOB(2097088000)\n", + " )\n", + " UNIQUE PRIMARY INDEX ( model_version );\n", + "\"\"\")\n", + "\n", + "get_context().execute(f\"\"\"\n", + "CREATE MULTISET TABLE {database}.pima_patient_predictions\n", + " (\n", + " job_id VARCHAR(255),\n", + " PatientId BIGINT,\n", + " HasDiabetes BIGINT,\n", + " json_report CLOB(1048544000) CHARACTER SET UNICODE\n", + " )\n", + " PRIMARY INDEX ( job_id );\n", + "\"\"\")" + ] + }, + { + "cell_type": "markdown", + "id": "520b92c2", + "metadata": {}, + "source": [ + "### Import Data\n", + "\n", + "Create and import the data for the following two tables\n", + "\n", + "- pima_patient_features\n", + "- pima_patient_diagnoses\n", + "- aoa_statistics_metadata\n", + "\n", + "`pima_patient_features` contains the features related to the patients medical history.\n", + "\n", + "`pima_patient_diagnoses` contains the diabetes diagnostic results for the patients.\n", + "\n", + "`aoa_statistics_metadata` contains the feature statistics metadata for the `pima_patient_features` and `pima_patient_diagnoses`\n", + "\n", + "Note the `pima_patient_feature` can be populated via the CLI by executing \n", + "\n", + "Compute the statistics metadata for the continuous variables\n", + "```bash\n", + "aoa feature compute-stats \\\n", + " -s . \\\n", + " -m . \\\n", + " -t continuous -c numtimesprg,plglcconc,bloodp,skinthick,twohourserins,bmi,dipedfunc,age\n", + "```\n", + "\n", + "Compute the statistics metadata for the categorical variables\n", + "```bash\n", + "aoa feature compute-stats \\\n", + " -s . \\\n", + " -m . 
\\\n", + "    -t categorical -c hasdiabetes\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9dca7bd3", + "metadata": {}, + "outputs": [], + "source": [ + "from teradataml import copy_to_sql, DataFrame\n", + "from teradatasqlalchemy.types import *\n", + "import pandas as pd\n", + "\n", + "df = pd.read_csv(\"data/pima_patient_features.csv\")\n", + "copy_to_sql(df=df, \n", + "            table_name=\"pima_patient_features\", \n", + "            schema_name=database,\n", + "            primary_index=\"PatientId\", \n", + "            if_exists=\"replace\", \n", + "            types={\n", + "                \"PatientId\": INTEGER,\n", + "                \"NumTimesPrg\": INTEGER, \n", + "                \"PlGlcConc\": INTEGER,\n", + "                \"BloodP\": INTEGER,\n", + "                \"SkinThick\": INTEGER,\n", + "                \"TwoHourSerIns\": INTEGER,\n", + "                \"BMI\": FLOAT,\n", + "                \"DiPedFunc\": FLOAT,\n", + "                \"Age\": INTEGER\n", + "            })\n", + "\n", + "df = pd.read_csv(\"data/pima_patient_diagnoses.csv\")\n", + "copy_to_sql(df=df, \n", + "            table_name=\"pima_patient_diagnoses\", \n", + "            schema_name=database,\n", + "            primary_index=\"PatientId\", \n", + "            if_exists=\"replace\", \n", + "            types={\n", + "                \"PatientId\": INTEGER,\n", + "                \"HasDiabetes\": INTEGER\n", + "            })\n", + "\n", + "# we can compute this from the CLI also - but let's import pre-computed for now.\n", + "df = pd.read_csv(\"data/aoa_statistics_metadata.csv\")\n", + "copy_to_sql(df=df, \n", + "            table_name=\"aoa_statistics_metadata\", \n", + "            schema_name=database,\n", + "            if_exists=\"append\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "97d65765", + "metadata": {}, + "source": [ + "## ModelOps UI\n", + "\n", + "#### Add Project\n", + "\n", + "- create project\n", + "    - Details\n", + "        - Name: Demo {your-name}\n", + "        - Description: ModelOps Demo\n", + "        - Group: {your-name}\n", + "        - Path: https://github.com/Teradata/modelops-demo-models \n", + "        - Credentials: No Credentials\n", + "        - Branch: master\n", + "    - Save And Continue\n", + "    - Service Connection\n", + "        - Skip for now\n", + "    - Personal Connection\n", + "        - Name: Vantage Personal {your-name}\n", + "        - Description: Vantage Demo Env\n", + "        - Host: {your-host}\n", + "        - Database: {your-db}\n", + "        - VAL Database: {your-val-db}\n", + "        - BYOM Database: {your-byom-db}\n", + "        - Login Mech: TDNEGO\n", + "        - Username/Password\n", + "    \n", + "    \n", + "#### Add Datasets\n", + "\n", + "- create dataset template\n", + "    - Catalog\n", + "        - Name: PIMA\n", + "        - Description: PIMA Diabetes\n", + "        - Feature Catalog: Vantage\n", + "        - Database: {your-db}\n", + "        - Table: aoa_statistics_metadata\n", + "    - Features\n", + "        - Query: `SELECT * FROM {your-db}.pima_patient_features`\n", + "        - Entity Key: PatientId\n", + "        - Features: NumTimesPrg, PlGlcConc, BloodP, SkinThick, TwoHourSerIns, BMI, DiPedFunc, Age\n", + "    - Entity & Target\n", + "        - Query: `SELECT * FROM {your-db}.pima_patient_diagnoses`\n", + "        - Entity Key: PatientId\n", + "        - Target: HasDiabetes\n", + "    - Predictions\n", + "        - Database: {your-db}\n", + "        - Table: pima_patient_predictions\n", + "        - Entity Selection: `SELECT * FROM pima_patient_features WHERE patientid MOD 5 = 0`\n", + "    \n", + "    \n", + "- create training dataset\n", + "    - Basic\n", + "        - Name: Train\n", + "        - Description: Training dataset\n", + "        - Scope: Training\n", + "    - Entity & Target\n", + "        - Query: `SELECT * FROM {your-db}.pima_patient_diagnoses WHERE patientid MOD 5 <> 0`\n", + "    \n", + "- create evaluation dataset\n", + "    - Basic\n", + "        - Name: Evaluate\n", + "        - Description: Evaluation dataset\n", + "        - Scope: Evaluation\n", + "    - 
Entity & Target\n", + "        - Query: `SELECT * FROM {your-db}.pima_patient_diagnoses WHERE patientid MOD 5 = 0`\n", + "    \n", + "\n", + "#### Model Lifecycle\n", + "\n", + "- Python Diabetes Prediction\n", + "    - Train\n", + "    - Evaluate\n", + "    - Review evaluation report\n", + "    - Approve \n", + "    - Deploy \n", + "    - Deployments/executions\n", + "    - Retire\n", + "- R Diabetes Prediction\n", + "    - Train\n", + "    - Evaluate\n", + "    - Review evaluation report\n", + "    - Approve \n", + "    - Deploy \n", + "    - Deployments/executions\n", + "    - Retire\n", + "- BYOM Diabetes Prediction\n", + "    - Run BYOM Notebook \n", + "    - Define BYOM Model \n", + "    - Import Version\n", + "    - Evaluate\n", + "    - Review evaluation report\n", + "    - Approve \n", + "    - Deploy \n", + "    - Deployments/executions\n", + "    - Retire" + ] + }, + { + "cell_type": "markdown", + "id": "be1b4671", + "metadata": {}, + "source": [ + "#### View Predictions\n", + "\n", + "In the UI, select a deployment from the deployments left-hand navigation. Go to the Jobs tab and, on the right-hand side of each job execution, you can select \"View Predictions\". This will show you a sample of the predictions for that particular job execution.\n", + "\n", + "Note, your predictions table must have a `job_id` column that matches the execution job id. If using BYOM, this is done automatically. For your own `scoring.py`, check out the demo models." + ] + }, + { + "cell_type": "markdown", + "id": "6b812b27", + "metadata": {}, + "source": [ + "## CLI \n", + "\n", + "\n", + "```bash\n", + "pip install \"aoa>=7.0.0rc3\"\n", + "```\n", + "\n", + "##### Copy CLI Config\n", + "\n", + "```\n", + "Copy the CLI config from ModelOps UI -> Session Details -> CLI config\n", + "```\n", + "\n", + "##### Add Dataset Connection\n", + "\n", + "```bash\n", + "aoa connection add\n", + "```\n", + "\n", + "##### List Feature Metadata\n", + "\n", + "```bash\n", + "aoa feature list-stats -m {your-db}.aoa_statistics_metadata\n", + "```\n", + "\n", + "##### Clone Project\n", + "\n", + "```bash\n", + "aoa clone \n", + "```\n", + "\n", + "```bash\n", + "cd modelops-demo-models\n", + "```\n", + "\n", + "##### Install Model Dependencies\n", + "\n", + "```bash\n", + "pip install -r model_definitions/python-diabetes/model_modules/requirements.txt\n", + "```\n", + "\n", + "##### Train Model\n", + "\n", + "```bash\n", + "aoa run\n", + "```\n", + "\n", + "##### Add Model\n", + "\n", + "```bash\n", + "aoa add\n", + "```\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "99270257", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/quickstarts/modelops/images/BYOM.png b/quickstarts/modelops/images/BYOM.png new file mode 100644 index 0000000000..9b1bf00f9b Binary files /dev/null and b/quickstarts/modelops/images/BYOM.png differ diff --git a/quickstarts/modelops/images/Define_BYOM_Model.png b/quickstarts/modelops/images/Define_BYOM_Model.png new file mode 100644 index 0000000000..b9529f93a1 Binary files /dev/null and b/quickstarts/modelops/images/Define_BYOM_Model.png differ diff --git a/quickstarts/modelops/images/ModelOps_Healthcheck.png 
b/quickstarts/modelops/images/ModelOps_Healthcheck.png new file mode 100644 index 0000000000..22d9a47367 Binary files /dev/null and b/quickstarts/modelops/images/ModelOps_Healthcheck.png differ diff --git a/quickstarts/modelops/images/Personal_Connection.png b/quickstarts/modelops/images/Personal_Connection.png new file mode 100644 index 0000000000..0bccd123e9 Binary files /dev/null and b/quickstarts/modelops/images/Personal_Connection.png differ diff --git a/quickstarts/modelops/images/Project_Creating.png b/quickstarts/modelops/images/Project_Creating.png new file mode 100644 index 0000000000..b9c40bd0a1 Binary files /dev/null and b/quickstarts/modelops/images/Project_Creating.png differ diff --git a/quickstarts/modelops/images/alert_configuration.png b/quickstarts/modelops/images/alert_configuration.png new file mode 100644 index 0000000000..15f3a92fd5 Binary files /dev/null and b/quickstarts/modelops/images/alert_configuration.png differ diff --git a/quickstarts/modelops/images/alert_configuration2.png b/quickstarts/modelops/images/alert_configuration2.png new file mode 100644 index 0000000000..3a5ce5e259 Binary files /dev/null and b/quickstarts/modelops/images/alert_configuration2.png differ diff --git a/quickstarts/modelops/images/alert_configuration3.png b/quickstarts/modelops/images/alert_configuration3.png new file mode 100644 index 0000000000..22e47062c2 Binary files /dev/null and b/quickstarts/modelops/images/alert_configuration3.png differ diff --git a/quickstarts/modelops/images/alert_configuration4.png b/quickstarts/modelops/images/alert_configuration4.png new file mode 100644 index 0000000000..17a3592045 Binary files /dev/null and b/quickstarts/modelops/images/alert_configuration4.png differ diff --git a/quickstarts/modelops/images/alert_new1.png b/quickstarts/modelops/images/alert_new1.png new file mode 100644 index 0000000000..7015a8061a Binary files /dev/null and b/quickstarts/modelops/images/alert_new1.png differ diff --git a/quickstarts/modelops/images/alert_new2.png b/quickstarts/modelops/images/alert_new2.png new file mode 100644 index 0000000000..3c8f66b1d3 Binary files /dev/null and b/quickstarts/modelops/images/alert_new2.png differ diff --git a/quickstarts/modelops/images/alert_new3.png b/quickstarts/modelops/images/alert_new3.png new file mode 100644 index 0000000000..8daa9d7510 Binary files /dev/null and b/quickstarts/modelops/images/alert_new3.png differ diff --git a/quickstarts/modelops/images/byom_basic.png b/quickstarts/modelops/images/byom_basic.png new file mode 100644 index 0000000000..84d55bf893 Binary files /dev/null and b/quickstarts/modelops/images/byom_basic.png differ diff --git a/quickstarts/modelops/images/byom_meth.png b/quickstarts/modelops/images/byom_meth.png new file mode 100644 index 0000000000..a699ba7ca7 Binary files /dev/null and b/quickstarts/modelops/images/byom_meth.png differ diff --git a/quickstarts/modelops/images/byom_model.png b/quickstarts/modelops/images/byom_model.png new file mode 100644 index 0000000000..054f0698c9 Binary files /dev/null and b/quickstarts/modelops/images/byom_model.png differ diff --git a/quickstarts/modelops/images/byom_monitoring1.png b/quickstarts/modelops/images/byom_monitoring1.png new file mode 100644 index 0000000000..95cde0a11e Binary files /dev/null and b/quickstarts/modelops/images/byom_monitoring1.png differ diff --git a/quickstarts/modelops/images/byom_monitoring2.png b/quickstarts/modelops/images/byom_monitoring2.png new file mode 100644 index 0000000000..15843f904e Binary files /dev/null and 
b/quickstarts/modelops/images/byom_monitoring2.png differ diff --git a/quickstarts/modelops/images/byom_monitoring_3.png b/quickstarts/modelops/images/byom_monitoring_3.png new file mode 100644 index 0000000000..26cc16eaa0 Binary files /dev/null and b/quickstarts/modelops/images/byom_monitoring_3.png differ diff --git a/quickstarts/modelops/images/byom_monitoring_save.png b/quickstarts/modelops/images/byom_monitoring_save.png new file mode 100644 index 0000000000..f18a6f5569 Binary files /dev/null and b/quickstarts/modelops/images/byom_monitoring_save.png differ diff --git a/quickstarts/modelops/images/dataset_template.png b/quickstarts/modelops/images/dataset_template.png new file mode 100644 index 0000000000..990a8455ca Binary files /dev/null and b/quickstarts/modelops/images/dataset_template.png differ diff --git a/quickstarts/modelops/images/dataset_template2.png b/quickstarts/modelops/images/dataset_template2.png new file mode 100644 index 0000000000..122892cb4f Binary files /dev/null and b/quickstarts/modelops/images/dataset_template2.png differ diff --git a/quickstarts/modelops/images/dataset_template_features.png b/quickstarts/modelops/images/dataset_template_features.png new file mode 100644 index 0000000000..273acdc282 Binary files /dev/null and b/quickstarts/modelops/images/dataset_template_features.png differ diff --git a/quickstarts/modelops/images/dataset_template_prediction.png b/quickstarts/modelops/images/dataset_template_prediction.png new file mode 100644 index 0000000000..5b60a569cf Binary files /dev/null and b/quickstarts/modelops/images/dataset_template_prediction.png differ diff --git a/quickstarts/modelops/images/dataset_template_target.png b/quickstarts/modelops/images/dataset_template_target.png new file mode 100644 index 0000000000..5f4094d8ec Binary files /dev/null and b/quickstarts/modelops/images/dataset_template_target.png differ diff --git a/quickstarts/modelops/images/datasets_created.png b/quickstarts/modelops/images/datasets_created.png new file mode 100644 index 0000000000..adb5b4ff93 Binary files /dev/null and b/quickstarts/modelops/images/datasets_created.png differ diff --git a/quickstarts/modelops/images/define_new.png b/quickstarts/modelops/images/define_new.png new file mode 100644 index 0000000000..a9952d123d Binary files /dev/null and b/quickstarts/modelops/images/define_new.png differ diff --git a/quickstarts/modelops/images/deploy.png b/quickstarts/modelops/images/deploy.png new file mode 100644 index 0000000000..d895dcea09 Binary files /dev/null and b/quickstarts/modelops/images/deploy.png differ diff --git a/quickstarts/modelops/images/deploy_details1.png b/quickstarts/modelops/images/deploy_details1.png new file mode 100644 index 0000000000..46d97a93c2 Binary files /dev/null and b/quickstarts/modelops/images/deploy_details1.png differ diff --git a/quickstarts/modelops/images/deploy_details2.png b/quickstarts/modelops/images/deploy_details2.png new file mode 100644 index 0000000000..4ea6e9d0a2 Binary files /dev/null and b/quickstarts/modelops/images/deploy_details2.png differ diff --git a/quickstarts/modelops/images/deploy_details3.png b/quickstarts/modelops/images/deploy_details3.png new file mode 100644 index 0000000000..4c7961b2b0 Binary files /dev/null and b/quickstarts/modelops/images/deploy_details3.png differ diff --git a/quickstarts/modelops/images/deploy_job.png b/quickstarts/modelops/images/deploy_job.png new file mode 100644 index 0000000000..848d3e5bff Binary files /dev/null and b/quickstarts/modelops/images/deploy_job.png differ 
diff --git a/quickstarts/modelops/images/deployment_evaluate.png b/quickstarts/modelops/images/deployment_evaluate.png new file mode 100644 index 0000000000..7b815a3ae9 Binary files /dev/null and b/quickstarts/modelops/images/deployment_evaluate.png differ diff --git a/quickstarts/modelops/images/deployment_evaluate2.png b/quickstarts/modelops/images/deployment_evaluate2.png new file mode 100644 index 0000000000..72e01fc81a Binary files /dev/null and b/quickstarts/modelops/images/deployment_evaluate2.png differ diff --git a/quickstarts/modelops/images/deployment_jobs.png b/quickstarts/modelops/images/deployment_jobs.png new file mode 100644 index 0000000000..7f03703fb2 Binary files /dev/null and b/quickstarts/modelops/images/deployment_jobs.png differ diff --git a/quickstarts/modelops/images/deployment_jobs2.png b/quickstarts/modelops/images/deployment_jobs2.png new file mode 100644 index 0000000000..dd87da9b6a Binary files /dev/null and b/quickstarts/modelops/images/deployment_jobs2.png differ diff --git a/quickstarts/modelops/images/deployment_predictions.png b/quickstarts/modelops/images/deployment_predictions.png new file mode 100644 index 0000000000..8a647adf15 Binary files /dev/null and b/quickstarts/modelops/images/deployment_predictions.png differ diff --git a/quickstarts/modelops/images/deployments.png b/quickstarts/modelops/images/deployments.png new file mode 100644 index 0000000000..ce5ea1465c Binary files /dev/null and b/quickstarts/modelops/images/deployments.png differ diff --git a/quickstarts/modelops/images/enable_alerts.png b/quickstarts/modelops/images/enable_alerts.png new file mode 100644 index 0000000000..20d9b78b81 Binary files /dev/null and b/quickstarts/modelops/images/enable_alerts.png differ diff --git a/quickstarts/modelops/images/evaluation2.png b/quickstarts/modelops/images/evaluation2.png new file mode 100644 index 0000000000..5e0d6dd609 Binary files /dev/null and b/quickstarts/modelops/images/evaluation2.png differ diff --git a/quickstarts/modelops/images/evaluation2_detail.png b/quickstarts/modelops/images/evaluation2_detail.png new file mode 100644 index 0000000000..e99fd0c76c Binary files /dev/null and b/quickstarts/modelops/images/evaluation2_detail.png differ diff --git a/quickstarts/modelops/images/evaluation_dataset.png b/quickstarts/modelops/images/evaluation_dataset.png new file mode 100644 index 0000000000..3692382998 Binary files /dev/null and b/quickstarts/modelops/images/evaluation_dataset.png differ diff --git a/quickstarts/modelops/images/evaluation_dataset_basic.png b/quickstarts/modelops/images/evaluation_dataset_basic.png new file mode 100644 index 0000000000..0b9127b013 Binary files /dev/null and b/quickstarts/modelops/images/evaluation_dataset_basic.png differ diff --git a/quickstarts/modelops/images/evaluation_job.png b/quickstarts/modelops/images/evaluation_job.png new file mode 100644 index 0000000000..a56544d3a0 Binary files /dev/null and b/quickstarts/modelops/images/evaluation_job.png differ diff --git a/quickstarts/modelops/images/evaluation_report.png b/quickstarts/modelops/images/evaluation_report.png new file mode 100644 index 0000000000..9c250771e0 Binary files /dev/null and b/quickstarts/modelops/images/evaluation_report.png differ diff --git a/quickstarts/modelops/images/evaluation_report2.png b/quickstarts/modelops/images/evaluation_report2.png new file mode 100644 index 0000000000..e2de8f665a Binary files /dev/null and b/quickstarts/modelops/images/evaluation_report2.png differ diff --git 
a/quickstarts/modelops/images/execute-airflow-workflows-with-clearscape-analytics-modelops-model-factory-solution/DAG_graph.png b/quickstarts/modelops/images/execute-airflow-workflows-with-clearscape-analytics-modelops-model-factory-solution/DAG_graph.png new file mode 100644 index 0000000000..69dc4cddfc Binary files /dev/null and b/quickstarts/modelops/images/execute-airflow-workflows-with-clearscape-analytics-modelops-model-factory-solution/DAG_graph.png differ diff --git a/quickstarts/modelops/images/execute-airflow-workflows-with-clearscape-analytics-modelops-model-factory-solution/DAGs.png b/quickstarts/modelops/images/execute-airflow-workflows-with-clearscape-analytics-modelops-model-factory-solution/DAGs.png new file mode 100644 index 0000000000..6f17a8ea20 Binary files /dev/null and b/quickstarts/modelops/images/execute-airflow-workflows-with-clearscape-analytics-modelops-model-factory-solution/DAGs.png differ diff --git a/quickstarts/modelops/images/execute-airflow-workflows-with-clearscape-analytics-modelops-model-factory-solution/LoginPage.png b/quickstarts/modelops/images/execute-airflow-workflows-with-clearscape-analytics-modelops-model-factory-solution/LoginPage.png new file mode 100644 index 0000000000..5ba39af71b Binary files /dev/null and b/quickstarts/modelops/images/execute-airflow-workflows-with-clearscape-analytics-modelops-model-factory-solution/LoginPage.png differ diff --git a/quickstarts/modelops/images/execute-airflow-workflows-with-clearscape-analytics-modelops-model-factory-solution/Workflow.png b/quickstarts/modelops/images/execute-airflow-workflows-with-clearscape-analytics-modelops-model-factory-solution/Workflow.png new file mode 100644 index 0000000000..8d68be2d3f Binary files /dev/null and b/quickstarts/modelops/images/execute-airflow-workflows-with-clearscape-analytics-modelops-model-factory-solution/Workflow.png differ diff --git a/quickstarts/modelops/images/execute-airflow-workflows-with-clearscape-analytics-modelops-model-factory-solution/modelOps1.png b/quickstarts/modelops/images/execute-airflow-workflows-with-clearscape-analytics-modelops-model-factory-solution/modelOps1.png new file mode 100644 index 0000000000..7d8fb964ca Binary files /dev/null and b/quickstarts/modelops/images/execute-airflow-workflows-with-clearscape-analytics-modelops-model-factory-solution/modelOps1.png differ diff --git a/quickstarts/modelops/images/execute-airflow-workflows-with-clearscape-analytics-modelops-model-factory-solution/successTasks.png b/quickstarts/modelops/images/execute-airflow-workflows-with-clearscape-analytics-modelops-model-factory-solution/successTasks.png new file mode 100644 index 0000000000..c2e0c8c6f2 Binary files /dev/null and b/quickstarts/modelops/images/execute-airflow-workflows-with-clearscape-analytics-modelops-model-factory-solution/successTasks.png differ diff --git a/quickstarts/modelops/images/feature_drift.png b/quickstarts/modelops/images/feature_drift.png new file mode 100644 index 0000000000..957cc0b502 Binary files /dev/null and b/quickstarts/modelops/images/feature_drift.png differ diff --git a/quickstarts/modelops/images/go.png b/quickstarts/modelops/images/go.png new file mode 100644 index 0000000000..6d18df6f23 Binary files /dev/null and b/quickstarts/modelops/images/go.png differ diff --git a/quickstarts/modelops/images/healthcheck.png b/quickstarts/modelops/images/healthcheck.png new file mode 100644 index 0000000000..5b4aa6cc06 Binary files /dev/null and b/quickstarts/modelops/images/healthcheck.png differ diff --git 
a/quickstarts/modelops/images/jobs.png b/quickstarts/modelops/images/jobs.png new file mode 100644 index 0000000000..eb65981e1c Binary files /dev/null and b/quickstarts/modelops/images/jobs.png differ diff --git a/quickstarts/modelops/images/model_evaluate.png b/quickstarts/modelops/images/model_evaluate.png new file mode 100644 index 0000000000..8895e28286 Binary files /dev/null and b/quickstarts/modelops/images/model_evaluate.png differ diff --git a/quickstarts/modelops/images/model_evaluate2.png b/quickstarts/modelops/images/model_evaluate2.png new file mode 100644 index 0000000000..cfa64bdd77 Binary files /dev/null and b/quickstarts/modelops/images/model_evaluate2.png differ diff --git a/quickstarts/modelops/images/model_version.png b/quickstarts/modelops/images/model_version.png new file mode 100644 index 0000000000..9bcc58445c Binary files /dev/null and b/quickstarts/modelops/images/model_version.png differ diff --git a/quickstarts/modelops/images/modelops-git.png b/quickstarts/modelops/images/modelops-git.png new file mode 100644 index 0000000000..e4d7ab3431 Binary files /dev/null and b/quickstarts/modelops/images/modelops-git.png differ diff --git a/quickstarts/modelops/images/performance.png b/quickstarts/modelops/images/performance.png new file mode 100644 index 0000000000..b45149ba52 Binary files /dev/null and b/quickstarts/modelops/images/performance.png differ diff --git a/quickstarts/modelops/images/personal1.png b/quickstarts/modelops/images/personal1.png new file mode 100644 index 0000000000..fdfa547c83 Binary files /dev/null and b/quickstarts/modelops/images/personal1.png differ diff --git a/quickstarts/modelops/images/prediction_drift.png b/quickstarts/modelops/images/prediction_drift.png new file mode 100644 index 0000000000..24bab4166b Binary files /dev/null and b/quickstarts/modelops/images/prediction_drift.png differ diff --git a/quickstarts/modelops/images/projects.png b/quickstarts/modelops/images/projects.png new file mode 100644 index 0000000000..4e404e916e Binary files /dev/null and b/quickstarts/modelops/images/projects.png differ diff --git a/quickstarts/modelops/images/projects_quickstart.png b/quickstarts/modelops/images/projects_quickstart.png new file mode 100644 index 0000000000..f8c5e16823 Binary files /dev/null and b/quickstarts/modelops/images/projects_quickstart.png differ diff --git a/quickstarts/modelops/images/save_continue.png b/quickstarts/modelops/images/save_continue.png new file mode 100644 index 0000000000..d83105061f Binary files /dev/null and b/quickstarts/modelops/images/save_continue.png differ diff --git a/quickstarts/modelops/images/statistics_job.png b/quickstarts/modelops/images/statistics_job.png new file mode 100644 index 0000000000..b0b927c2d8 Binary files /dev/null and b/quickstarts/modelops/images/statistics_job.png differ diff --git a/quickstarts/modelops/images/training_dataset.png b/quickstarts/modelops/images/training_dataset.png new file mode 100644 index 0000000000..f00845b841 Binary files /dev/null and b/quickstarts/modelops/images/training_dataset.png differ diff --git a/quickstarts/modelops/images/training_dataset_basic.png b/quickstarts/modelops/images/training_dataset_basic.png new file mode 100644 index 0000000000..e77d86127b Binary files /dev/null and b/quickstarts/modelops/images/training_dataset_basic.png differ diff --git a/quickstarts/modelops/images/view_details.png b/quickstarts/modelops/images/view_details.png new file mode 100644 index 0000000000..195e77c01e Binary files /dev/null and 
b/quickstarts/modelops/images/view_details.png differ diff --git a/quickstarts/other-integrations/attachments/execute-airflow-workflows-that-use-dbt-with-teradata-vantage/Dockerfile b/quickstarts/other-integrations/attachments/execute-airflow-workflows-that-use-dbt-with-teradata-vantage/Dockerfile new file mode 100644 index 0000000000..dcfb8954c6 --- /dev/null +++ b/quickstarts/other-integrations/attachments/execute-airflow-workflows-that-use-dbt-with-teradata-vantage/Dockerfile @@ -0,0 +1,18 @@ +FROM apache/airflow:2.3.2 + +USER root + +RUN sudo apt update && \ + sudo apt-get install -y git + + +USER airflow + +RUN pip install sqlalchemy && \ + pip install sqlalchemy-teradata && \ + pip install teradatasql && \ + pip install teradatasqlalchemy && \ + pip install dbt-teradata && \ + pip install boto3 && \ + pip install ijson && \ + pip install --no-cache-dir awscli diff --git a/quickstarts/other-integrations/attachments/execute-airflow-workflows-that-use-dbt-with-teradata-vantage/airflow.cfg b/quickstarts/other-integrations/attachments/execute-airflow-workflows-that-use-dbt-with-teradata-vantage/airflow.cfg new file mode 100644 index 0000000000..ec8324bf37 --- /dev/null +++ b/quickstarts/other-integrations/attachments/execute-airflow-workflows-that-use-dbt-with-teradata-vantage/airflow.cfg @@ -0,0 +1,1112 @@ +[core] +# The folder where your airflow pipelines live, most likely a +# subfolder in a code repository. This path must be absolute. +dags_folder = /opt/airflow/dags + +# Hostname by providing a path to a callable, which will resolve the hostname. +# The format is "package.function". +# +# For example, default value "socket.getfqdn" means that result from getfqdn() of "socket" +# package will be used as hostname. +# +# No argument should be required in the function specified. +# If using IP address as hostname is preferred, use value ``airflow.utils.net.get_host_ip_address`` +hostname_callable = socket.getfqdn + +# Default timezone in case supplied date times are naive +# can be utc (default), system, or any IANA timezone string (e.g. Europe/Amsterdam) +default_timezone = utc + +# The executor class that airflow should use. Choices include +# ``SequentialExecutor``, ``LocalExecutor``, ``CeleryExecutor``, ``DaskExecutor``, +# ``KubernetesExecutor``, ``CeleryKubernetesExecutor`` or the +# full import path to the class when using a custom executor. +executor = SequentialExecutor + +# The SqlAlchemy connection string to the metadata database. +# SqlAlchemy supports many different database engines. +# More information here: +# http://airflow.apache.org/docs/apache-airflow/stable/howto/set-up-database.html#database-uri +sql_alchemy_conn = sqlite:////opt/airflow/airflow.db + +# The encoding for the databases +sql_engine_encoding = utf-8 + +# Collation for ``dag_id``, ``task_id``, ``key`` columns in case they have different encoding. +# By default this collation is the same as the database collation, however for ``mysql`` and ``mariadb`` +# the default is ``utf8mb3_bin`` so that the index sizes of our index keys will not exceed +# the maximum size of allowed index when collation is set to ``utf8mb4`` variant +# (see https://github.com/apache/airflow/pull/17603#issuecomment-901121618). +# sql_engine_collation_for_ids = + +# If SqlAlchemy should pool database connections. +sql_alchemy_pool_enabled = True + +# The SqlAlchemy pool size is the maximum number of database connections +# in the pool. 0 indicates no limit. +sql_alchemy_pool_size = 5 + +# The maximum overflow size of the pool. 
+# When the number of checked-out connections reaches the size set in pool_size, +# additional connections will be returned up to this limit. +# When those additional connections are returned to the pool, they are disconnected and discarded. +# It follows then that the total number of simultaneous connections the pool will allow +# is pool_size + max_overflow, +# and the total number of "sleeping" connections the pool will allow is pool_size. +# max_overflow can be set to ``-1`` to indicate no overflow limit; +# no limit will be placed on the total number of concurrent connections. Defaults to ``10``. +sql_alchemy_max_overflow = 10 + +# The SqlAlchemy pool recycle is the number of seconds a connection +# can be idle in the pool before it is invalidated. This config does +# not apply to sqlite. If the number of DB connections is ever exceeded, +# a lower config value will allow the system to recover faster. +sql_alchemy_pool_recycle = 1800 + +# Check connection at the start of each connection pool checkout. +# Typically, this is a simple statement like "SELECT 1". +# More information here: +# https://docs.sqlalchemy.org/en/13/core/pooling.html#disconnect-handling-pessimistic +sql_alchemy_pool_pre_ping = True + +# The schema to use for the metadata database. +# SqlAlchemy supports databases with the concept of multiple schemas. +sql_alchemy_schema = + +# Import path for connect args in SqlAlchemy. Defaults to an empty dict. +# This is useful when you want to configure db engine args that SqlAlchemy won't parse +# in connection string. +# See https://docs.sqlalchemy.org/en/13/core/engines.html#sqlalchemy.create_engine.params.connect_args +# sql_alchemy_connect_args = + +# This defines the maximum number of task instances that can run concurrently in Airflow +# regardless of scheduler count and worker count. Generally, this value is reflective of +# the number of task instances with the running state in the metadata database. +parallelism = 32 + +# The maximum number of task instances allowed to run concurrently in each DAG. To calculate +# the number of tasks that is running concurrently for a DAG, add up the number of running +# tasks for all DAG runs of the DAG. This is configurable at the DAG level with ``max_active_tasks``, +# which is defaulted as ``max_active_tasks_per_dag``. +# +# An example scenario when this would be useful is when you want to stop a new dag with an early +# start date from stealing all the executor slots in a cluster. +max_active_tasks_per_dag = 16 + +# Are DAGs paused by default at creation +dags_are_paused_at_creation = True + +# The maximum number of active DAG runs per DAG. The scheduler will not create more DAG runs +# if it reaches the limit. This is configurable at the DAG level with ``max_active_runs``, +# which is defaulted as ``max_active_runs_per_dag``. +max_active_runs_per_dag = 16 + +# Whether to load the DAG examples that ship with Airflow. It's good to +# get started, but you probably want to set this to ``False`` in a production +# environment +load_examples = True + +# Whether to load the default connections that ship with Airflow. 
It's good to +# get started, but you probably want to set this to ``False`` in a production +# environment +load_default_connections = True + +# Path to the folder containing Airflow plugins +plugins_folder = /opt/airflow/plugins + +# Should tasks be executed via forking of the parent process ("False", +# the speedier option) or by spawning a new python process ("True" slow, +# but means plugin changes picked up by tasks straight away) +execute_tasks_new_python_interpreter = False + +# Secret key to save connection passwords in the db +fernet_key = + +# Whether to disable pickling dags +donot_pickle = True + +# How long before timing out a python file import +dagbag_import_timeout = 30.0 + +# Should a traceback be shown in the UI for dagbag import errors, +# instead of just the exception message +dagbag_import_error_tracebacks = True + +# If tracebacks are shown, how many entries from the traceback should be shown +dagbag_import_error_traceback_depth = 2 + +# How long before timing out a DagFileProcessor, which processes a dag file +dag_file_processor_timeout = 50 + +# The class to use for running task instances in a subprocess. +# Choices include StandardTaskRunner, CgroupTaskRunner or the full import path to the class +# when using a custom task runner. +task_runner = StandardTaskRunner + +# If set, tasks without a ``run_as_user`` argument will be run with this user +# Can be used to de-elevate a sudo user running Airflow when executing tasks +default_impersonation = + +# What security module to use (for example kerberos) +security = + +# Turn unit test mode on (overwrites many configuration options with test +# values at runtime) +unit_test_mode = False + +# Whether to enable pickling for xcom (note that this is insecure and allows for +# RCE exploits). +enable_xcom_pickling = False + +# When a task is killed forcefully, this is the amount of time in seconds that +# it has to cleanup after it is sent a SIGTERM, before it is SIGKILLED +killed_task_cleanup_time = 60 + +# Whether to override params with dag_run.conf. If you pass some key-value pairs +# through ``airflow dags backfill -c`` or +# ``airflow dags trigger -c``, the key-value pairs will override the existing ones in params. +dag_run_conf_overrides_params = True + +# When discovering DAGs, ignore any files that don't contain the strings ``DAG`` and ``airflow``. +dag_discovery_safe_mode = True + +# The number of retries each task is going to have by default. Can be overridden at dag or task level. +default_task_retries = 0 + +# The weighting method used for the effective total priority weight of the task +default_task_weight_rule = downstream + +# Updating serialized DAG can not be faster than a minimum interval to reduce database write rate. +min_serialized_dag_update_interval = 30 + +# Fetching serialized DAG can not be faster than a minimum interval to reduce database +# read rate. This config controls when your DAGs are updated in the Webserver +min_serialized_dag_fetch_interval = 10 + +# Maximum number of Rendered Task Instance Fields (Template Fields) per task to store +# in the Database. +# All the template_fields for each of Task Instance are stored in the Database. +# Keeping this number small may cause an error when you try to view ``Rendered`` tab in +# TaskInstance view for older tasks. 
+max_num_rendered_ti_fields_per_task = 30 + +# On each dagrun check against defined SLAs +check_slas = True + +# Path to custom XCom class that will be used to store and resolve operators results +# Example: xcom_backend = path.to.CustomXCom +xcom_backend = airflow.models.xcom.BaseXCom + +# By default Airflow plugins are lazily-loaded (only loaded when required). Set it to ``False``, +# if you want to load plugins whenever 'airflow' is invoked via cli or loaded from module. +lazy_load_plugins = True + +# By default Airflow providers are lazily-discovered (discovery and imports happen only when required). +# Set it to False, if you want to discover providers whenever 'airflow' is invoked via cli or +# loaded from module. +lazy_discover_providers = True + +# Number of times the code should be retried in case of DB Operational Errors. +# Not all transactions will be retried as it can cause undesired state. +# Currently it is only used in ``DagFileProcessor.process_file`` to retry ``dagbag.sync_to_db``. +max_db_retries = 3 + +# Hide sensitive Variables or Connection extra json keys from UI and task logs when set to True +# +# (Connection passwords are always hidden in logs) +hide_sensitive_var_conn_fields = True + +# A comma-separated list of extra sensitive keywords to look for in variables names or connection's +# extra JSON. +sensitive_var_conn_names = + +# Task Slot counts for ``default_pool``. This setting would not have any effect in an existing +# deployment where the ``default_pool`` is already created. For existing deployments, users can +# change the number of slots using Webserver, API or the CLI +default_pool_task_slot_count = 128 + +[logging] +# The folder where airflow should store its log files. +# This path must be absolute. +# There are a few existing configurations that assume this is set to the default. +# If you choose to override this you may need to update the dag_processor_manager_log_location and +# dag_processor_manager_log_location settings as well. +base_log_folder = /opt/airflow/logs + +# Airflow can store logs remotely in AWS S3, Google Cloud Storage or Elastic Search. +# Set this to True if you want to enable remote logging. +remote_logging = False + +# Users must supply an Airflow connection id that provides access to the storage +# location. +remote_log_conn_id = + +# Path to Google Credential JSON file. If omitted, authorization based on `the Application Default +# Credentials +# `__ will +# be used. +google_key_path = + +# Storage bucket URL for remote logging +# S3 buckets should start with "s3://" +# Cloudwatch log groups should start with "cloudwatch://" +# GCS buckets should start with "gs://" +# WASB buckets should start with "wasb" just to help Airflow select correct handler +# Stackdriver logs should start with "stackdriver://" +remote_base_log_folder = + +# Use server-side encryption for logs stored in S3 +encrypt_s3_logs = False + +# Logging level. +# +# Supported values: ``CRITICAL``, ``ERROR``, ``WARNING``, ``INFO``, ``DEBUG``. +logging_level = INFO + +# Logging level for Flask-appbuilder UI. +# +# Supported values: ``CRITICAL``, ``ERROR``, ``WARNING``, ``INFO``, ``DEBUG``. +fab_logging_level = WARNING + +# Logging class +# Specify the class that will specify the logging configuration +# This class has to be on the python classpath +# Example: logging_config_class = my.path.default_local_settings.LOGGING_CONFIG +logging_config_class = + +# Flag to enable/disable Colored logs in Console +# Colour the logs when the controlling terminal is a TTY. 
+colored_console_log = True + +# Log format for when Colored logs is enabled +colored_log_format = [%%(blue)s%%(asctime)s%%(reset)s] {%%(blue)s%%(filename)s:%%(reset)s%%(lineno)d} %%(log_color)s%%(levelname)s%%(reset)s - %%(log_color)s%%(message)s%%(reset)s +colored_formatter_class = airflow.utils.log.colored_log.CustomTTYColoredFormatter + +# Format of Log line +log_format = [%%(asctime)s] {%%(filename)s:%%(lineno)d} %%(levelname)s - %%(message)s +simple_log_format = %%(asctime)s %%(levelname)s - %%(message)s + +# Specify prefix pattern like mentioned below with stream handler TaskHandlerWithCustomFormatter +# Example: task_log_prefix_template = {ti.dag_id}-{ti.task_id}-{execution_date}-{try_number} +task_log_prefix_template = + +# Formatting for how airflow generates file names/paths for each task run. +log_filename_template = {{ ti.dag_id }}/{{ ti.task_id }}/{{ ts }}/{{ try_number }}.log + +# Formatting for how airflow generates file names for log +log_processor_filename_template = {{ filename }}.log + +# Full path of dag_processor_manager logfile. +dag_processor_manager_log_location = /opt/airflow/logs/dag_processor_manager/dag_processor_manager.log + +# Name of handler to read task instance logs. +# Defaults to use ``task`` handler. +task_log_reader = task + +# A comma\-separated list of third-party logger names that will be configured to print messages to +# consoles\. +# Example: extra_logger_names = connexion,sqlalchemy +extra_logger_names = + +# When you start an airflow worker, airflow starts a tiny web server +# subprocess to serve the workers local log files to the airflow main +# web server, who then builds pages and sends them to users. This defines +# the port on which the logs are served. It needs to be unused, and open +# visible from the main web server to connect into the workers. +worker_log_server_port = 8793 + +[metrics] + +# StatsD (https://github.com/etsy/statsd) integration settings. +# Enables sending metrics to StatsD. +statsd_on = False +statsd_host = localhost +statsd_port = 8125 +statsd_prefix = airflow + +# If you want to avoid sending all the available metrics to StatsD, +# you can configure an allow list of prefixes (comma separated) to send only the metrics that +# start with the elements of the list (e.g: "scheduler,executor,dagrun") +statsd_allow_list = + +# A function that validate the statsd stat name, apply changes to the stat name if necessary and return +# the transformed stat name. +# +# The function should have the following signature: +# def func_name(stat_name: str) -> str: +stat_name_handler = + +# To enable datadog integration to send airflow metrics. +statsd_datadog_enabled = False + +# List of datadog tags attached to all metrics(e.g: key1:value1,key2:value2) +statsd_datadog_tags = + +# If you want to utilise your own custom Statsd client set the relevant +# module path below. +# Note: The module path must exist on your PYTHONPATH for Airflow to pick it up +# statsd_custom_client_path = + +[secrets] +# Full class name of secrets backend to enable (will precede env vars and metastore in search path) +# Example: backend = airflow.providers.amazon.aws.secrets.systems_manager.SystemsManagerParameterStoreBackend +backend = + +# The backend_kwargs param is loaded into a dictionary and passed to __init__ of secrets backend class. +# See documentation for the secrets backend you are using. JSON is expected. 
+# Example for AWS Systems Manager ParameterStore: +# ``{"connections_prefix": "/airflow/connections", "profile_name": "default"}`` +backend_kwargs = + +[cli] +# In what way should the cli access the API. The LocalClient will use the +# database directly, while the json_client will use the api running on the +# webserver +api_client = airflow.api.client.local_client + +# If you set web_server_url_prefix, do NOT forget to append it here, ex: +# ``endpoint_url = http://localhost:8080/myroot`` +# So api will look like: ``http://localhost:8080/myroot/api/experimental/...`` +endpoint_url = http://localhost:8080 + +[debug] +# Used only with ``DebugExecutor``. If set to ``True`` DAG will fail with first +# failed task. Helpful for debugging purposes. +fail_fast = False + +[api] +# Enables the deprecated experimental API. Please note that these APIs do not have access control. +# The authenticated user has full access. +# +# .. warning:: +# +# This `Experimental REST API `__ is +# deprecated since version 2.0. Please consider using +# `the Stable REST API `__. +# For more information on migration, see +# `UPDATING.md `_ +enable_experimental_api = False + +# How to authenticate users of the API. See +# https://airflow.apache.org/docs/apache-airflow/stable/security.html for possible values. +# ("airflow.api.auth.backend.default" allows all requests for historic reasons) +auth_backend = airflow.api.auth.backend.deny_all + +# Used to set the maximum page limit for API requests +maximum_page_limit = 100 + +# Used to set the default page limit when limit is zero. A default limit +# of 100 is set on OpenApi spec. However, this particular default limit +# only work when limit is set equal to zero(0) from API requests. +# If no limit is supplied, the OpenApi spec default is used. +fallback_page_limit = 100 + +# The intended audience for JWT token credentials used for authorization. This value must match on the client and server sides. If empty, audience will not be tested. +# Example: google_oauth2_audience = project-id-random-value.apps.googleusercontent.com +google_oauth2_audience = + +# Path to Google Cloud Service Account key file (JSON). If omitted, authorization based on +# `the Application Default Credentials +# `__ will +# be used. +# Example: google_key_path = /files/service-account-json +google_key_path = + +# Used in response to a preflight request to indicate which HTTP +# headers can be used when making the actual request. This header is +# the server side response to the browser's +# Access-Control-Request-Headers header. +access_control_allow_headers = + +# Specifies the method or methods allowed when accessing the resource. +access_control_allow_methods = + +# Indicates whether the response can be shared with requesting code from the given origins. +# Separate URLs with space. +access_control_allow_origins = + +[lineage] +# what lineage backend to use +backend = + +[atlas] +sasl_enabled = False +host = +port = 21000 +username = +password = + +[operators] +# The default owner assigned to each new operator, unless +# provided explicitly or passed via ``default_args`` +default_owner = airflow +default_cpus = 1 +default_ram = 512 +default_disk = 512 +default_gpus = 0 + +# Default queue that tasks get assigned to and that worker listen on. +default_queue = default + +# Is allowed to pass additional/unused arguments (args, kwargs) to the BaseOperator operator. +# If set to False, an exception will be thrown, otherwise only the console message will be displayed. 
+allow_illegal_arguments = False + +[hive] +# Default mapreduce queue for HiveOperator tasks +default_hive_mapred_queue = + +# Template for mapred_job_name in HiveOperator, supports the following named parameters +# hostname, dag_id, task_id, execution_date +# mapred_job_name_template = + +[webserver] +# The base url of your website as airflow cannot guess what domain or +# cname you are using. This is used in automated emails that +# airflow sends to point links to the right web server +base_url = http://localhost:8080 + +# Default timezone to display all dates in the UI, can be UTC, system, or +# any IANA timezone string (e.g. Europe/Amsterdam). If left empty the +# default value of core/default_timezone will be used +# Example: default_ui_timezone = America/New_York +default_ui_timezone = UTC + +# The ip specified when starting the web server +web_server_host = 0.0.0.0 + +# The port on which to run the web server +web_server_port = 8080 + +# Paths to the SSL certificate and key for the web server. When both are +# provided SSL will be enabled. This does not change the web server port. +web_server_ssl_cert = + +# Paths to the SSL certificate and key for the web server. When both are +# provided SSL will be enabled. This does not change the web server port. +web_server_ssl_key = + +# The type of backend used to store web session data, can be 'database' or 'securecookie' +# Example: session_backend = securecookie +session_backend = database + +# Number of seconds the webserver waits before killing gunicorn master that doesn't respond +web_server_master_timeout = 120 + +# Number of seconds the gunicorn webserver waits before timing out on a worker +web_server_worker_timeout = 120 + +# Number of workers to refresh at a time. When set to 0, worker refresh is +# disabled. When nonzero, airflow periodically refreshes webserver workers by +# bringing up new ones and killing old ones. +worker_refresh_batch_size = 1 + +# Number of seconds to wait before refreshing a batch of workers. +worker_refresh_interval = 6000 + +# If set to True, Airflow will track files in plugins_folder directory. When it detects changes, +# then reload the gunicorn. +reload_on_plugin_change = False + +# Secret key used to run your flask app. It should be as random as possible. However, when running +# more than 1 instances of webserver, make sure all of them use the same ``secret_key`` otherwise +# one of them will error with "CSRF session token is missing". +secret_key = g/rHkt7pPrfeHOlAWr5EaQ== + +# Number of workers to run the Gunicorn web server +workers = 4 + +# The worker class gunicorn should use. Choices include +# sync (default), eventlet, gevent +worker_class = sync + +# Log files for the gunicorn webserver. '-' means log to stderr. +access_logfile = - + +# Log files for the gunicorn webserver. '-' means log to stderr. +error_logfile = - + +# Access log format for gunicorn webserver. +# default format is %%(h)s %%(l)s %%(u)s %%(t)s "%%(r)s" %%(s)s %%(b)s "%%(f)s" "%%(a)s" +# documentation - https://docs.gunicorn.org/en/stable/settings.html#access-log-format +access_logformat = + +# Expose the configuration file in the web server +expose_config = False + +# Expose hostname in the web server +expose_hostname = True + +# Expose stacktrace in the web server +expose_stacktrace = True + +# Default DAG view. Valid values are: ``tree``, ``graph``, ``duration``, ``gantt``, ``landing_times`` +dag_default_view = tree + +# Default DAG orientation. 
Valid values are: +# ``LR`` (Left->Right), ``TB`` (Top->Bottom), ``RL`` (Right->Left), ``BT`` (Bottom->Top) +dag_orientation = LR + +# The amount of time (in secs) webserver will wait for initial handshake +# while fetching logs from other worker machine +log_fetch_timeout_sec = 5 + +# Time interval (in secs) to wait before next log fetching. +log_fetch_delay_sec = 2 + +# Distance away from page bottom to enable auto tailing. +log_auto_tailing_offset = 30 + +# Animation speed for auto tailing log display. +log_animation_speed = 1000 + +# By default, the webserver shows paused DAGs. Flip this to hide paused +# DAGs by default +hide_paused_dags_by_default = False + +# Consistent page size across all listing views in the UI +page_size = 100 + +# Define the color of navigation bar +navbar_color = #fff + +# Default dagrun to show in UI +default_dag_run_display_number = 25 + +# Enable werkzeug ``ProxyFix`` middleware for reverse proxy +enable_proxy_fix = False + +# Number of values to trust for ``X-Forwarded-For``. +# More info: https://werkzeug.palletsprojects.com/en/0.16.x/middleware/proxy_fix/ +proxy_fix_x_for = 1 + +# Number of values to trust for ``X-Forwarded-Proto`` +proxy_fix_x_proto = 1 + +# Number of values to trust for ``X-Forwarded-Host`` +proxy_fix_x_host = 1 + +# Number of values to trust for ``X-Forwarded-Port`` +proxy_fix_x_port = 1 + +# Number of values to trust for ``X-Forwarded-Prefix`` +proxy_fix_x_prefix = 1 + +# Set secure flag on session cookie +cookie_secure = False + +# Set samesite policy on session cookie +cookie_samesite = Lax + +# Default setting for wrap toggle on DAG code and TI log views. +default_wrap = False + +# Allow the UI to be rendered in a frame +x_frame_enabled = True + +# Send anonymous user activity to your analytics tool +# choose from google_analytics, segment, or metarouter +# analytics_tool = + +# Unique ID of your account in the analytics tool +# analytics_id = + +# 'Recent Tasks' stats will show for old DagRuns if set +show_recent_stats_for_completed_runs = True + +# Update FAB permissions and sync security manager roles +# on webserver startup +update_fab_perms = True + +# The UI cookie lifetime in minutes. User will be logged out from UI after +# ``session_lifetime_minutes`` of non-activity +session_lifetime_minutes = 43200 + +# Sets a custom page title for the DAGs overview page and site title for all pages +# instance_name = + +# How frequently, in seconds, the DAG data will auto-refresh in graph or tree view +# when auto-refresh is turned on +auto_refresh_interval = 3 + +[email] + +# Configuration email backend and whether to +# send email alerts on retry or failure +# Email backend to use +email_backend = airflow.utils.email.send_email_smtp + +# Email connection to use +email_conn_id = smtp_default + +# Whether email alerts should be sent when a task is retried +default_email_on_retry = True + +# Whether email alerts should be sent when a task failed +default_email_on_failure = True + +# File that will be used as the template for Email subject (which will be rendered using Jinja2). +# If not set, Airflow uses a base template. +# Example: subject_template = /path/to/my_subject_template_file +# subject_template = + +# File that will be used as the template for Email content (which will be rendered using Jinja2). +# If not set, Airflow uses a base template. +# Example: html_content_template = /path/to/my_html_content_template_file +# html_content_template = + +# Email address that will be used as sender address. 
+# It can either be raw email or the complete address in a format ``Sender Name `` +# Example: from_email = Airflow +# from_email = + +[smtp] + +# If you want airflow to send emails on retries, failure, and you want to use +# the airflow.utils.email.send_email_smtp function, you have to configure an +# smtp server here +smtp_host = localhost +smtp_starttls = True +smtp_ssl = False +# Example: smtp_user = airflow +# smtp_user = +# Example: smtp_password = airflow +# smtp_password = +smtp_port = 25 +smtp_mail_from = airflow@example.com +smtp_timeout = 30 +smtp_retry_limit = 5 + +[sentry] + +# Sentry (https://docs.sentry.io) integration. Here you can supply +# additional configuration options based on the Python platform. See: +# https://docs.sentry.io/error-reporting/configuration/?platform=python. +# Unsupported options: ``integrations``, ``in_app_include``, ``in_app_exclude``, +# ``ignore_errors``, ``before_breadcrumb``, ``transport``. +# Enable error reporting to Sentry +sentry_on = false +sentry_dsn = + +# Dotted path to a before_send function that the sentry SDK should be configured to use. +# before_send = + +[celery_kubernetes_executor] + +# This section only applies if you are using the ``CeleryKubernetesExecutor`` in +# ``[core]`` section above +# Define when to send a task to ``KubernetesExecutor`` when using ``CeleryKubernetesExecutor``. +# When the queue of a task is the value of ``kubernetes_queue`` (default ``kubernetes``), +# the task is executed via ``KubernetesExecutor``, +# otherwise via ``CeleryExecutor`` +kubernetes_queue = kubernetes + +[celery] + +# This section only applies if you are using the CeleryExecutor in +# ``[core]`` section above +# The app name that will be used by celery +celery_app_name = airflow.executors.celery_executor + +# The concurrency that will be used when starting workers with the +# ``airflow celery worker`` command. This defines the number of task instances that +# a worker will take, so size up your workers based on the resources on +# your worker box and the nature of your tasks +worker_concurrency = 16 + +# The maximum and minimum concurrency that will be used when starting workers with the +# ``airflow celery worker`` command (always keep minimum processes, but grow +# to maximum if necessary). Note the value should be max_concurrency,min_concurrency +# Pick these numbers based on resources on worker box and the nature of the task. +# If autoscale option is available, worker_concurrency will be ignored. +# http://docs.celeryproject.org/en/latest/reference/celery.bin.worker.html#cmdoption-celery-worker-autoscale +# Example: worker_autoscale = 16,12 +# worker_autoscale = + +# Used to increase the number of tasks that a worker prefetches which can improve performance. +# The number of processes multiplied by worker_prefetch_multiplier is the number of tasks +# that are prefetched by a worker. A value greater than 1 can result in tasks being unnecessarily +# blocked if there are multiple workers and one worker prefetches tasks that sit behind long +# running tasks while another worker has unutilized processes that are unable to process the already +# claimed blocked tasks. +# https://docs.celeryproject.org/en/stable/userguide/optimizing.html#prefetch-limits +# Example: worker_prefetch_multiplier = 1 +# worker_prefetch_multiplier = + +# Umask that will be used when starting workers with the ``airflow celery worker`` +# in daemon mode. 
This control the file-creation mode mask which determines the initial +# value of file permission bits for newly created files. +worker_umask = 0o077 + +# The Celery broker URL. Celery supports RabbitMQ, Redis and experimentally +# a sqlalchemy database. Refer to the Celery documentation for more information. +broker_url = redis://redis:6379/0 + +# The Celery result_backend. When a job finishes, it needs to update the +# metadata of the job. Therefore it will post a message on a message bus, +# or insert it into a database (depending of the backend) +# This status is used by the scheduler to update the state of the task +# The use of a database is highly recommended +# http://docs.celeryproject.org/en/latest/userguide/configuration.html#task-result-backend-settings +result_backend = db+postgresql://postgres:airflow@postgres/airflow + +# Celery Flower is a sweet UI for Celery. Airflow has a shortcut to start +# it ``airflow celery flower``. This defines the IP that Celery Flower runs on +flower_host = 0.0.0.0 + +# The root URL for Flower +# Example: flower_url_prefix = /flower +flower_url_prefix = + +# This defines the port that Celery Flower runs on +flower_port = 5555 + +# Securing Flower with Basic Authentication +# Accepts user:password pairs separated by a comma +# Example: flower_basic_auth = user1:password1,user2:password2 +flower_basic_auth = + +# How many processes CeleryExecutor uses to sync task state. +# 0 means to use max(1, number of cores - 1) processes. +sync_parallelism = 0 + +# Import path for celery configuration options +celery_config_options = airflow.config_templates.default_celery.DEFAULT_CELERY_CONFIG +ssl_active = False +ssl_key = +ssl_cert = +ssl_cacert = + +# Celery Pool implementation. +# Choices include: ``prefork`` (default), ``eventlet``, ``gevent`` or ``solo``. +# See: +# https://docs.celeryproject.org/en/latest/userguide/workers.html#concurrency +# https://docs.celeryproject.org/en/latest/userguide/concurrency/eventlet.html +pool = prefork + +# The number of seconds to wait before timing out ``send_task_to_executor`` or +# ``fetch_celery_task_state`` operations. +operation_timeout = 1.0 + +# Celery task will report its status as 'started' when the task is executed by a worker. +# This is used in Airflow to keep track of the running tasks and if a Scheduler is restarted +# or run in HA mode, it can adopt the orphan tasks launched by previous SchedulerJob. +task_track_started = True + +# Time in seconds after which Adopted tasks are cleared by CeleryExecutor. This is helpful to clear +# stalled tasks. +task_adoption_timeout = 600 + +# The Maximum number of retries for publishing task messages to the broker when failing +# due to ``AirflowTaskTimeout`` error before giving up and marking Task as failed. +task_publish_max_retries = 3 + +# Worker initialisation check to validate Metadata Database connection +worker_precheck = False + +[celery_broker_transport_options] + +# This section is for specifying options which can be passed to the +# underlying celery broker transport. See: +# http://docs.celeryproject.org/en/latest/userguide/configuration.html#std:setting-broker_transport_options +# The visibility timeout defines the number of seconds to wait for the worker +# to acknowledge the task before the message is redelivered to another worker. +# Make sure to increase the visibility timeout to match the time of the longest +# ETA you're planning to use. +# visibility_timeout is only supported for Redis and SQS celery brokers. 
+# See: +# http://docs.celeryproject.org/en/master/userguide/configuration.html#std:setting-broker_transport_options +# Example: visibility_timeout = 21600 +# visibility_timeout = + +[dask] + +# This section only applies if you are using the DaskExecutor in +# [core] section above +# The IP address and port of the Dask cluster's scheduler. +cluster_address = 127.0.0.1:8786 + +# TLS/ SSL settings to access a secured Dask scheduler. +tls_ca = +tls_cert = +tls_key = + +[scheduler] +# Task instances listen for external kill signal (when you clear tasks +# from the CLI or the UI), this defines the frequency at which they should +# listen (in seconds). +job_heartbeat_sec = 5 + +# The scheduler constantly tries to trigger new tasks (look at the +# scheduler section in the docs for more information). This defines +# how often the scheduler should run (in seconds). +scheduler_heartbeat_sec = 5 + +# The number of times to try to schedule each DAG file +# -1 indicates unlimited number +num_runs = -1 + +# Controls how long the scheduler will sleep between loops, but if there was nothing to do +# in the loop. i.e. if it scheduled something then it will start the next loop +# iteration straight away. +scheduler_idle_sleep_time = 1 + +# Number of seconds after which a DAG file is parsed. The DAG file is parsed every +# ``min_file_process_interval`` number of seconds. Updates to DAGs are reflected after +# this interval. Keeping this number low will increase CPU usage. +min_file_process_interval = 30 + +# How often (in seconds) to check for stale DAGs (DAGs which are no longer present in +# the expected files) which should be deactivated. +deactivate_stale_dags_interval = 60 + +# How often (in seconds) to scan the DAGs directory for new files. Default to 5 minutes. +dag_dir_list_interval = 300 + +# How often should stats be printed to the logs. Setting to 0 will disable printing stats +print_stats_interval = 30 + +# How often (in seconds) should pool usage stats be sent to statsd (if statsd_on is enabled) +pool_metrics_interval = 5.0 + +# If the last scheduler heartbeat happened more than scheduler_health_check_threshold +# ago (in seconds), scheduler is considered unhealthy. +# This is used by the health check in the "/health" endpoint +scheduler_health_check_threshold = 30 + +# How often (in seconds) should the scheduler check for orphaned tasks and SchedulerJobs +orphaned_tasks_check_interval = 300.0 +child_process_log_directory = /opt/airflow/logs/scheduler + +# Local task jobs periodically heartbeat to the DB. If the job has +# not heartbeat in this many seconds, the scheduler will mark the +# associated task instance as failed and will re-schedule the task. +scheduler_zombie_task_threshold = 300 + +# Turn off scheduler catchup by setting this to ``False``. +# Default behavior is unchanged and +# Command Line Backfills still work, but the scheduler +# will not do scheduler catchup if this is ``False``, +# however it can be set on a per DAG basis in the +# DAG definition (catchup) +catchup_by_default = True + +# This changes the batch size of queries in the scheduling main loop. +# If this is too high, SQL query performance may be impacted by +# complexity of query predicate, and/or excessive locking. +# Additionally, you may hit the maximum allowable query length for your db. +# Set this to 0 for no limit (not advised) +max_tis_per_query = 512 + +# Should the scheduler issue ``SELECT ... FOR UPDATE`` in relevant queries. 
+# If this is set to False then you should not run more than a single +# scheduler at once +use_row_level_locking = True + +# Max number of DAGs to create DagRuns for per scheduler loop. +max_dagruns_to_create_per_loop = 10 + +# How many DagRuns should a scheduler examine (and lock) when scheduling +# and queuing tasks. +max_dagruns_per_loop_to_schedule = 20 + +# Should the Task supervisor process perform a "mini scheduler" to attempt to schedule more tasks of the +# same DAG. Leaving this on will mean tasks in the same DAG execute quicker, but might starve out other +# dags in some circumstances +schedule_after_task_execution = True + +# The scheduler can run multiple processes in parallel to parse dags. +# This defines how many processes will run. +parsing_processes = 2 + +# One of ``modified_time``, ``random_seeded_by_host`` and ``alphabetical``. +# The scheduler will list and sort the dag files to decide the parsing order. +# +# * ``modified_time``: Sort by modified time of the files. This is useful on large scale to parse the +# recently modified DAGs first. +# * ``random_seeded_by_host``: Sort randomly across multiple Schedulers but with same order on the +# same host. This is useful when running with Scheduler in HA mode where each scheduler can +# parse different DAG files. +# * ``alphabetical``: Sort by filename +file_parsing_sort_mode = modified_time + +# Turn off scheduler use of cron intervals by setting this to False. +# DAGs submitted manually in the web UI or with trigger_dag will still run. +use_job_schedule = True + +# Allow externally triggered DagRuns for Execution Dates in the future +# Only has effect if schedule_interval is set to None in DAG +allow_trigger_in_future = False + +# DAG dependency detector class to use +dependency_detector = airflow.serialization.serialized_objects.DependencyDetector + +# How often to check for expired trigger requests that have not run yet. +trigger_timeout_check_interval = 15 + +[triggerer] +# How many triggers a single Triggerer will run at once, by default. +default_capacity = 1000 + +[kerberos] +ccache = /tmp/airflow_krb5_ccache + +# gets augmented with fqdn +principal = airflow +reinit_frequency = 3600 +kinit_path = kinit +keytab = airflow.keytab + +# Allow to disable ticket forwardability. +forwardable = True + +# Allow to remove source IP from token, useful when using token behind NATted Docker host. +include_ip = True + +[github_enterprise] +api_rev = v3 + +[elasticsearch] +# Elasticsearch host +host = + +# Format of the log_id, which is used to query for a given tasks logs +log_id_template = {dag_id}-{task_id}-{execution_date}-{try_number} + +# Used to mark the end of a log stream for a task +end_of_log_mark = end_of_log + +# Qualified URL for an elasticsearch frontend (like Kibana) with a template argument for log_id +# Code will construct log_id using the log_id template from the argument above. 
+# NOTE: scheme will default to https if one is not provided +# Example: frontend = http://localhost:5601/app/kibana#/discover?_a=(columns:![message),query:(language:kuery,query:'log_id: "{log_id}"'),sort:![log.offset,asc)) +frontend = + +# Write the task logs to the stdout of the worker, rather than the default files +write_stdout = False + +# Instead of the default log formatter, write the log lines as JSON +json_format = False + +# Log fields to also attach to the json output, if enabled +json_fields = asctime, filename, lineno, levelname, message + +# The field where host name is stored (normally either `host` or `host.name`) +host_field = host + +# The field where offset is stored (normally either `offset` or `log.offset`) +offset_field = offset + +[elasticsearch_configs] +use_ssl = False +verify_certs = True + +[kubernetes] +# Path to the YAML pod file that forms the basis for KubernetesExecutor workers. +pod_template_file = + +# The repository of the Kubernetes Image for the Worker to Run +worker_container_repository = + +# The tag of the Kubernetes Image for the Worker to Run +worker_container_tag = + +# The Kubernetes namespace where airflow workers should be created. Defaults to ``default`` +namespace = default + +# If True, all worker pods will be deleted upon termination +delete_worker_pods = True + +# If False (and delete_worker_pods is True), +# failed worker pods will not be deleted so users can investigate them. +# This only prevents removal of worker pods where the worker itself failed, +# not when the task it ran failed. +delete_worker_pods_on_failure = False + +# Number of Kubernetes Worker Pod creation calls per scheduler loop. +# Note that the current default of "1" will only launch a single pod +# per-heartbeat. It is HIGHLY recommended that users increase this +# number to match the tolerance of their kubernetes cluster for +# better performance. +worker_pods_creation_batch_size = 1 + +# Allows users to launch pods in multiple namespaces. +# Will require creating a cluster-role for the scheduler +multi_namespace_mode = False + +# Use the service account kubernetes gives to pods to connect to kubernetes cluster. +# It's intended for clients that expect to be running inside a pod running on kubernetes. +# It will raise an exception if called from a process not running in a kubernetes environment. +in_cluster = True + +# When running with in_cluster=False change the default cluster_context or config_file +# options to Kubernetes client. Leave blank these to use default behaviour like ``kubectl`` has. +# cluster_context = + +# Path to the kubernetes configfile to be used when ``in_cluster`` is set to False +# config_file = + +# Keyword parameters to pass while calling a kubernetes client core_v1_api methods +# from Kubernetes Executor provided as a single line formatted JSON dictionary string. +# List of supported params are similar for all core_v1_apis, hence a single config +# variable for all apis. See: +# https://raw.githubusercontent.com/kubernetes-client/python/41f11a09995efcd0142e25946adc7591431bfb2f/kubernetes/client/api/core_v1_api.py +kube_client_request_args = + +# Optional keyword arguments to pass to the ``delete_namespaced_pod`` kubernetes client +# ``core_v1_api`` method when using the Kubernetes Executor. 
+# This should be an object and can contain any of the options listed in the ``v1DeleteOptions`` +# class defined here: +# https://github.com/kubernetes-client/python/blob/41f11a09995efcd0142e25946adc7591431bfb2f/kubernetes/client/models/v1_delete_options.py#L19 +# Example: delete_option_kwargs = {"grace_period_seconds": 10} +delete_option_kwargs = + +# Enables TCP keepalive mechanism. This prevents Kubernetes API requests to hang indefinitely +# when idle connection is time-outed on services like cloud load balancers or firewalls. +enable_tcp_keepalive = True + +# When the `enable_tcp_keepalive` option is enabled, TCP probes a connection that has +# been idle for `tcp_keep_idle` seconds. +tcp_keep_idle = 120 + +# When the `enable_tcp_keepalive` option is enabled, if Kubernetes API does not respond +# to a keepalive probe, TCP retransmits the probe after `tcp_keep_intvl` seconds. +tcp_keep_intvl = 30 + +# When the `enable_tcp_keepalive` option is enabled, if Kubernetes API does not respond +# to a keepalive probe, TCP retransmits the probe `tcp_keep_cnt number` of times before +# a connection is considered to be broken. +tcp_keep_cnt = 6 + +# Set this to false to skip verifying SSL certificate of Kubernetes python client. +verify_ssl = True + +# How long in seconds a worker can be in Pending before it is considered a failure +worker_pods_pending_timeout = 300 + +# How often in seconds to check if Pending workers have exceeded their timeouts +worker_pods_pending_timeout_check_interval = 120 + +# How often in seconds to check for task instances stuck in "queued" status without a pod +worker_pods_queued_check_interval = 60 + +# How many pending pods to check for timeout violations in each check interval. +# You may want this higher if you have a very large cluster and/or use ``multi_namespace_mode``. +worker_pods_pending_timeout_batch_size = 100 + +[smart_sensor] +# When `use_smart_sensor` is True, Airflow redirects multiple qualified sensor tasks to +# smart sensor task. +use_smart_sensor = False + +# `shard_code_upper_limit` is the upper limit of `shard_code` value. The `shard_code` is generated +# by `hashcode % shard_code_upper_limit`. +shard_code_upper_limit = 10000 + +# The number of running smart sensor processes for each service. +shards = 5 + +# comma separated sensor classes support in smart_sensor. 
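+# Multiple sensor classes can be listed comma-separated, for example (illustrative only):
+# sensors_enabled = NamedHivePartitionSensor,MetastorePartitionSensor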
+sensors_enabled = NamedHivePartitionSensor + +# Igor comment diff --git a/quickstarts/other-integrations/attachments/execute-airflow-workflows-that-use-dbt-with-teradata-vantage/airflow_dbt_integration.py b/quickstarts/other-integrations/attachments/execute-airflow-workflows-that-use-dbt-with-teradata-vantage/airflow_dbt_integration.py new file mode 100644 index 0000000000..1f6fd5328e --- /dev/null +++ b/quickstarts/other-integrations/attachments/execute-airflow-workflows-that-use-dbt-with-teradata-vantage/airflow_dbt_integration.py @@ -0,0 +1,47 @@ +from airflow import DAG +from airflow.operators.python import PythonOperator, BranchPythonOperator +from airflow.operators.bash import BashOperator +from airflow.operators.dummy_operator import DummyOperator +from datetime import datetime + + +default_args = { + 'owner': 'airflow', + 'depends_on_past': False, + 'start_date': datetime(2020,8,1), + 'retries': 0 +} + +with DAG('airflow_dbt_integration', default_args=default_args, schedule_interval='@once') as dag: + task_1 = BashOperator( + task_id='dbt_debug', + bash_command='cd /opt/airflow && rm -f logs/dbt.log && dbt debug', + dag=dag + ) + + task_2 = BashOperator( + task_id='dbt_seed', + bash_command='cd /opt/airflow && dbt seed', + dag=dag + ) + + task_3 = BashOperator( + task_id='dbt_run', + bash_command='cd /opt/airflow && dbt run', + dag=dag + ) + + task_4 = BashOperator( + task_id='dbt_test', + bash_command='cd /opt/airflow && dbt test', + dag=dag + ) + + task_5 = BashOperator( + task_id='dbt_docs_generate', + bash_command='cd /opt/airflow && dbt docs generate', + dag=dag + ) + + + task_1 >> task_2 >> task_3 >> task_4 >> task_5 # Define dependencies diff --git a/quickstarts/other-integrations/attachments/execute-airflow-workflows-that-use-dbt-with-teradata-vantage/db_test_example_dag.py b/quickstarts/other-integrations/attachments/execute-airflow-workflows-that-use-dbt-with-teradata-vantage/db_test_example_dag.py new file mode 100644 index 0000000000..7335141e13 --- /dev/null +++ b/quickstarts/other-integrations/attachments/execute-airflow-workflows-that-use-dbt-with-teradata-vantage/db_test_example_dag.py @@ -0,0 +1,152 @@ +from datetime import datetime, timedelta +from airflow import DAG +from airflow.models import Variable +from airflow.operators.python_operator import PythonOperator, BranchPythonOperator +from airflow.operators.bash_operator import BashOperator +import pendulum +import teradatasql +import logging +import getpass +import urllib.parse +import sqlalchemy +from sqlalchemy import exc +from sqlalchemy.dialects import registry + + +db_user = 'airflowtest' +db_password = 'abcd' +db_IP_address = '44.236.48.243' +SQL_string_cleanup = 'drop table employee;drop table organization' + +SQL_string_create_employee = 'create table employee (employee_id integer, name varchar(40), emp_position varchar(40), salary integer, organization_id integer);insert into employee (1,\'John Smith\',\'Engineer\',80000,1);insert into employee (2,\'Jennifer Jones\',\'Account Manager\',100000,2);insert into employee (3,\'William Bowman\',\'Product Manager\',90000,3);insert into employee (1,\'Meghan Stein\',\'Project Manager\',75000,1);' + +SQL_string_create_organization = 'create table organization (organization_id integer, organization_name varchar(40), organization_status varchar(10)); insert into organization (1,\'Engineering\',\'Active\');insert into organization (2,\'Sales\',\'Active\');insert into organization (3,\'Marketing\',\'Active\');insert into organization 
(4,\'Engineering-Old\',\'Inactive\')'
+
+SQL_string_select = 'select avg(employee.salary),organization.organization_name from employee,organization where employee.organization_id=organization.organization_id and organization.organization_status=\'Active\' group by organization.organization_name'
+
+
+# Execute SQL statements passed in a single string; the string can contain one or more SQL commands separated by ";"
+
+def executeSQLString(db_user, db_password, db_IP_address, SQL_string):
+
+    # all SQL commands (split by ';')
+    sqlCommands = SQL_string.split(';')
+
+    # create database connection
+    try:
+        registry.register("teradatasql", "teradatasqlalchemy.dialect", "TeradataDialect")
+        enginedbc = sqlalchemy.create_engine('teradatasql://'+db_IP_address+'/?user='+db_user+'&password='+db_password, connect_args={'sslmode': "DISABLE"})
+        conn = enginedbc.connect()
+        logging.info ("Database connection with "+db_IP_address+" established successfully.")
+    except Exception as ex:
+        logging.error(str(ex))
+
+    # Execute every command from the input string
+    for command in sqlCommands:
+        # This will skip and report errors
+        # For example, if the tables do not yet exist, this will skip over
+        # the DROP TABLE commands
+        # Check if sql command empty
+        if not command.strip():
+            continue
+        sqlresp=''
+        try:
+            logging.info("Executing command : "+command.strip('\n'))
+            sqlresp=conn.execute(command)
+            for row in sqlresp:
+                logging.info(row)
+            # for key, value in row.items():
+            #     logging.info(str(key) + ' : ' + str(value))
+
+        except exc.SQLAlchemyError as e:
+            logging.warn(type(e))
+            complete_err = str(e.orig.args)
+            # ignore table does not exist, object does not exist, database already exists errors, storage does not exist, view does not exist;
+            # add any errors that you want to be ignored
+            if (("[Error 3802]" in complete_err) or ("[Error 3807]" in complete_err) or ("[Error 6938]" in complete_err) or ("[Error 5612]" in complete_err) or ("[Error 4836]" in complete_err) or ("[Error 3706]" in complete_err)):
+                logging.warn("Ignoring error "+complete_err.partition('\\n')[0])
+            else:
+                logging.error("Terminating execution because of error "+complete_err.partition('\\n')[0])
+                raise
+
+    conn.close()
+
+
+def _cleanup():
+    try:
+        logging.info ("Calling execute SQL string.")
+        executeSQLString(db_user, db_password, db_IP_address, SQL_string_cleanup)
+        logging.info ("Completed execute SQL files.")
+    except Exception as ex:
+        logging.error(str(ex))
+
+def _create_employee():
+    try:
+        logging.info ("Calling execute SQL string.")
+        executeSQLString(db_user, db_password, db_IP_address, SQL_string_create_employee)
+        logging.info ("Completed execute SQL files.")
+    except Exception as ex:
+        logging.error(str(ex))
+
+def _create_organization():
+    try:
+        logging.info ("Calling execute SQL string.")
+        executeSQLString(db_user, db_password, db_IP_address, SQL_string_create_organization)
+        logging.info ("Completed execute SQL files.")
+    except Exception as ex:
+        logging.error(str(ex))
+
+def _run_query():
+    try:
+        logging.info ("Calling execute SQL string.")
+        executeSQLString(db_user, db_password, db_IP_address, SQL_string_select)
+        logging.info ("Completed execute SQL files.")
+    except Exception as ex:
+        logging.error(str(ex))
+
+
+with DAG("db_test_example", start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
+         schedule_interval=None, catchup=False) as dag:
+
+    cleanup = PythonOperator(
+        task_id="cleanup",
+        python_callable=_cleanup,
+        depends_on_past=False
+    )
+
+    create_employee = PythonOperator(
task_id="create_employee", + python_callable=_create_employee, + depends_on_past=False + ) + + create_organization = PythonOperator( + task_id="create_organization", + python_callable=_create_organization, + depends_on_past=False + ) + + run_query = PythonOperator( + task_id="run_query", + python_callable=_run_query, + depends_on_past=False + ) + + + + + + + + +cleanup >> [create_employee, create_organization] >> run_query + + + diff --git a/quickstarts/other-integrations/attachments/execute-airflow-workflows-that-use-dbt-with-teradata-vantage/discover_dag.py b/quickstarts/other-integrations/attachments/execute-airflow-workflows-that-use-dbt-with-teradata-vantage/discover_dag.py new file mode 100644 index 0000000000..17b641f172 --- /dev/null +++ b/quickstarts/other-integrations/attachments/execute-airflow-workflows-that-use-dbt-with-teradata-vantage/discover_dag.py @@ -0,0 +1,489 @@ +# Airflow DAG to load a generic number of parquet, csv and json files into a Teradata 20 database on Amazon Web Services (AWS). +# The files are assumed to be located on specific S3 buckets (location defined in Airflow variables - csv files go to the csv +# S3 bucket, json files to the json bucket, parquet into the parquet bucket). +# The script locates the files, determines the structure of the files (columns, delimiters, etc.) and: +# 1. Creates the needed databases (scv database for csv files, json database for json files, multiple parquet databases are created based on the +# parquet directory names). If databases are already created, it skips this step. +# 2. Creates a teradata foreign table that point to the files +# 3. Creates a NOS table (uses S3 storage) and uses the previously created foreign tables to load them (doing select from +# foreign tables / insert into NOS tables). +# +# The Airflow environment must be created by using a docker_compose.yaml and Dockerfile to include all the needed packages and libraries. + + +from datetime import datetime, timedelta +from airflow.decorators import dag,task +from airflow import AirflowException +from airflow.models import Variable +import teradatasql +import logging +import getpass +import urllib.parse +import sqlalchemy +from sqlalchemy import exc +from sqlalchemy.dialects import registry +from airflow import DAG +from airflow.operators.python import PythonOperator, BranchPythonOperator +from airflow.operators.bash import BashOperator +from airflow.operators.dummy_operator import DummyOperator +import os +import sys +import ijson +import json +import subprocess +import csv +import boto3 + +# Airflow variables that must be imported before running this DAG. +# A sample variables.json file is provided aas an example +# AWS keys: +aws_access_key_id =Variable.get("aws_access_key_id") +aws_secret_access_key =Variable.get("aws_secret_access_key") +# S3 Locations. 
Ls locations (to be used for the aws ls command line interface) have a different format than ft locations (where files reside) +s3_location_parq_ls =Variable.get("s3_location_parq_ls") +s3_location_parq_ft =Variable.get("s3_location_parq_ft") +s3_location_csv_ls =Variable.get("s3_location_csv_ls") +s3_location_csv_create =Variable.get("s3_location_csv_create") +s3_location_csv_ft =Variable.get("s3_location_csv_ft") +s3_location_json_ls =Variable.get("s3_location_json_ls") +s3_location_json_ft =Variable.get("s3_location_json_ft") +s3_location_json_create =Variable.get("s3_location_json_create") +# s3 bucket is the top S3 bucket where the data resides (and where the parquet directories start), csv and json are subbuckets where these types of +# files reside +s3_bucket =Variable.get("s3_bucket") +csv_subbucket =Variable.get("csv_subbucket") +json_subbucket =Variable.get("json_subbucket") +# Temp file where the list of databases to be created reside +filenamedb=Variable.get("filenamedb") +# Location of temporary files where databasename tablename are listed for the program fo create tables +parqfilenamedbtab=Variable.get("parqfilenamedbtab") +csvfilenamedbtab=Variable.get("csvfilenamedbtab") +jsonfilenamedbtab=Variable.get("jsonfilenamedbtab") +alldbtab =Variable.get("alldbtab") +# Csv variables, database name and delimiters supported +csvdb=Variable.get("csvdb") +supported_csvdelimiters=Variable.get("supported_csvdelimiters") +supported_csvlineterminator=Variable.get("supported_csvlineterminator") +# Json db name +jsondb=Variable.get("jsondb") +# Target database variables - the DB user must have database create privileges - the csvdb and jsondb are going to be created under the user's +# datbase. +DB_username =Variable.get("DB_username") +DB_password =Variable.get("DB_password") +DB_ip_address =Variable.get("DB_ip_address") +# Authorization object name +auth_name =Variable.get("auth_name") +region_name=Variable.get("region_name") +# Temp file used to determine json file format +output_file =Variable.get("output_file") +# Sample size (num of lines) to determine csv format +linenumax=int(Variable.get("linenumax")) +# Permanent size for csv and json databases. Here one size fits all, to change it has to be slightly modified +perm_dbsize=Variable.get("perm_dbsize") +# NOS storage name +nos_storage=Variable.get("nos_storage") +# Flags to let the program know which types of files to load. 'Y' to loas the specific file type. +load_csv=Variable.get("load_csv") +load_json=Variable.get("load_json") +load_parquet=Variable.get("load_parquet") + + + +# Genertes the JSON nos select command for table table_name +def get_json_nos_select_comm(table_name): + try: + + command_getvalues = 'select ' + json_sample_size = 'top 100' + + logging.info("Generating JSON nos select command") + + command_getfields = 'select * from (SELECT distinct * FROM JSON_KEYS (ON (SELECT ' + json_sample_size + ' payload FROM ' + table_name + ' )) AS j ) as cols;' + + logging.info("Command to get json fields: " + command_getfields) + + registry.register("teradatasql", "teradatasqlalchemy.dialect", "TeradataDialect") + enginedbc = sqlalchemy.create_engine('teradatasql://'+DB_ip_address+'/?user='+DB_username+'&password='+DB_password, connect_args={'sslmode': "DISABLE"}) + conn = enginedbc.connect() + logging.info("Database connection with "+DB_ip_address+" established successfully.") + sqlrespfields=conn.execute(command_getfields) + for row in sqlrespfields: + for key, value in row.items(): + fieldname = '"payload".' 
+ str(value) + ' ' + value.replace('"."','__').replace('"','')
+                print(fieldname)
+                command_getvalues = command_getvalues + fieldname + ', '
+        command_getvalues = command_getvalues[:-3]
+        command_getvalues = command_getvalues + ' from ' + table_name
+        logging.info('JSON nos select command: \n\n' + command_getvalues + '\n')
+        conn.close()
+        return(command_getvalues)
+    except Exception as ex:
+        logging.error(str(ex))
+        raise AirflowException
+
+
+# Execute a string of SQL commands separated by semicolons (;)
+def execute_sql_commands(commands):
+    try:
+        logging.info ("SQL commands: " + commands)
+        sqlcommands = commands.split(';')
+        registry.register("teradatasql", "teradatasqlalchemy.dialect", "TeradataDialect")
+        enginedbc = sqlalchemy.create_engine('teradatasql://'+DB_ip_address+'/?user='+DB_username+'&password='+DB_password, connect_args={'sslmode': "DISABLE"})
+        conn = enginedbc.connect()
+        logging.info ("Database connection with "+DB_ip_address+" established successfully.")
+
+        # execute each command in turn (files to tbl):
+        for sqlcommand in sqlcommands:
+            try:
+                logging.info ("SQL Command: " + sqlcommand)
+                sqlresp=conn.execute(sqlcommand)
+                for row in sqlresp:
+                    logging.info(row)
+
+            except exc.SQLAlchemyError as e:
+                logging.warn(type(e))
+                complete_err = str(e.orig.args)
+                # ignore table does not exist, object does not exist, database already exists errors, storage does not exist, view does not exist
+                if (("[Error 3802]" in complete_err) or ("[Error 3807]" in complete_err) or ("[Error 6938]" in complete_err) or ("[Error 5612]" in complete_err) or ("[Error 4836]" in complete_err) or ("[Error 3706]" in complete_err)):
+                    logging.warn("Ignoring error "+complete_err.partition('\\n')[0])
+                    continue
+                else:
+                    logging.error("Terminating execution because of error "+complete_err.partition('\\n')[0])
+                    raise AirflowException
+
+        conn.close()
+
+    except Exception as ex:
+        logging.error(str(ex))
+        raise AirflowException
+
+
+# Returns bash script string containing the script that creates a file containing the database names to be created
+def create_db_file_bash(filenamedb):
+    empty_bash_str = 'touch ' + filenamedb + ';'
+    csv_bash_str = 'echo \'' + csvdb + '\' >> ' + filenamedb + ';'
+    json_bash_str = 'echo \'' + jsondb + '\' >> ' + filenamedb + ';'
+    parquet_bash_str = 'export AWS_ACCESS_KEY_ID=' + aws_access_key_id + ' ; export AWS_SECRET_ACCESS_KEY=' + aws_secret_access_key + '; aws s3 ls ' + s3_location_parq_ls + ' | awk \'{print $2}\' | sed \'s#/##\' >> ' + filenamedb + ';'
+    create_db_file_bash_command = empty_bash_str
+    if (load_csv == 'Y'):
+        create_db_file_bash_command = create_db_file_bash_command + csv_bash_str
+    if (load_json == 'Y'):
+        create_db_file_bash_command = create_db_file_bash_command + json_bash_str
+    if (load_parquet == 'Y'):
+        create_db_file_bash_command = create_db_file_bash_command + parquet_bash_str
+    logging.info ("Returning db file creation bash command: " + create_db_file_bash_command)
+    return (create_db_file_bash_command)
+
+
+# SQL and Bash scripts
+
+# Bash command to create placeholder empty files
+create_placeholder_files_command = 'touch ' + csvfilenamedbtab + '; touch ' + jsonfilenamedbtab + '; touch ' + parqfilenamedbtab + '; touch ' + alldbtab
+
+# Bash command to create a file containing the names of parquet files to be loaded
+create_parq_db_tab_file_bash_command = 'export AWS_ACCESS_KEY_ID=' + aws_access_key_id + ' ; export AWS_SECRET_ACCESS_KEY=' + aws_secret_access_key + '; for DB in `aws s3 ls ' + s3_location_parq_ls + '| awk \'{print $2}\' | sed \'s#/##\' `; do aws s3 ls ' + s3_location_parq_ls + '$DB/ | awk \'{print db,$2}\' db="${DB}" | sed \'s#/##\'; done > ' + parqfilenamedbtab
+
+# Bash command to create a temporary file containing the names of csv files to be loaded
+create_csv_tab_file_bash_command = 'export AWS_ACCESS_KEY_ID=' + aws_access_key_id + ' ; export AWS_SECRET_ACCESS_KEY=' + aws_secret_access_key + '; aws s3 ls ' + s3_location_csv_ls + ' | awk \'{print "+csv+ " $4}\' | sed \'s#/##\' | tail -n +2 > ' + csvfilenamedbtab
+
+# Bash command to create a temporary file containing the names of json files to be loaded
+create_json_tab_file_bash_command = 'export AWS_ACCESS_KEY_ID=' + aws_access_key_id + ' ; export AWS_SECRET_ACCESS_KEY=' + aws_secret_access_key + '; aws s3 ls ' + s3_location_json_ls + ' | awk \'{print "+json+ " $4}\' | sed \'s#/##\' | tail -n +2 > ' + jsonfilenamedbtab
+
+# Bash commands to create a temporary file containing the names of all (json, csv, parquet) files to be loaded
+join_csv_tab_files_bash_command = 'cat ' + csvfilenamedbtab + ' >> ' + alldbtab
+join_json_tab_files_bash_command = 'cat ' + jsonfilenamedbtab + ' >> ' + alldbtab
+join_parquet_tab_files_bash_command = 'cat ' + parqfilenamedbtab + ' >> ' + alldbtab
+
+# Bash command to clean up files containing table and database lists from the previous run if they exist
+cleanup_bash_command = 'rm -f ' + filenamedb + ' ' + parqfilenamedbtab + ' ' + csvfilenamedbtab + ' ' + jsonfilenamedbtab + ' ' + alldbtab
+
+# Returns the delimiter of the csv file. Supported csv delimiters are defined by the supported_csvdelimiters variable.
+# S3 bucket and file name are passed as arguments.
+def csv_delimiter(bucket, file):
+    try:
+        s3 = boto3.resource( 's3', region_name=region_name, aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)
+        # bucket = topmost bucket, like tc-001-teracloud-nos-us-west-2-3745abcd, file = filename incl. lower buckets, ex csvdata/inventory.csv
+        # where complete path is tc-001-teracloud-nos-us-west-2-3745abcd/csvdata/inventory.csv
+        obj = s3.Object(bucket,file)
+        line = obj.get()['Body']._raw_stream.readline().decode('UTF-8')
+        dialect = csv.Sniffer().sniff(line, delimiters=supported_csvdelimiters)
+        delimiter = dialect.__dict__['delimiter']
+        return(delimiter)
+    except Exception as ex:
+        logging.error(str(ex))
+        raise AirflowException
+
+
+# Returns the JSON fields (columns) in the file, delimited by the '|' character. In case the JSON file is nested, the columns are flattened.
+# A sample of the JSON file (number of lines defined by the linenumax variable) is copied from S3 to the filesystem, and then ijson is used to examine it. Linenumax is by default set to 100, but for complex files can be increased.
+def json_fields(bucket, file):
+    try:
+
+        s3 = boto3.resource( 's3', region_name=region_name, aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)
+        # bucket = topmost bucket, like tc-perf-001-teracloud-nos-us-west-2-3745a70d0aef, file = filename incl.
lower buckets, ex csvdata/inventory.csv + # where complete path is tc-perf-001-teracloud-nos-us-west-2-3745a70d0aef/csvdata/inventory.csv + logging.info('Json-fields, Bucket: ' + bucket + ', File: ' + file) + obj = s3.Object(bucket,file) + + if os.path.exists(output_file): + os.remove(output_file) + + f = open(output_file,'w+') + + linenum=1 + while linenum <= linenumax: + # line = print_line(s3_bucket,json_subbucket + '/' + 'pd_review.json') + line = obj.get(]('Body']._raw_stream.readline().decode('UTF-8') + f.write(line) + linenum = linenum + 1 + + f.seek(0) + + logging.info('Json-fields, Out Temp File: ' + output_file) + objects = ijson.items(f, "", multiple_values=True) + logging.info('Json-fields, Json objects: ' + str(objects)) + + key_string="" + + for obj in objects: + first = True + for i in obj.keys(): + if first: + key_string=key_string+i + first = False + else: + key_string=key_string+'|'+i + break + + f.close + logging.info('Json-fields, Field string: ' + key_string) + + return(key_string) + + + except Exception as ex: + logging.error(str(ex)) + raise AirflowException + + + + + +default_args = { + 'owner': 'airflow', + 'depends_on_past': False, + 'start_date': datetime(2020,8,1), + 'retries': 0 +} + + +@dag(dag_id="discover_dag", schedule_interval=None, start_date=datetime(2022, 4, 2)) +def taskflow(): + + + # Create a temporary file containing the names of all databases. + # CSV database name comes from the variable csvdb, JSON database from variable jsondb, Parquet database(s) from the parquet sub-bucket name(s) + # The file is created in the directory name defined by the variable filenamedb. By default this is /tmp/db.txt on the host system or + # /opt/airflow/tmp/db.txt on the container, but is configurable by changing the variable value and the /tmp mount in the docker_compose.yaml file + @task + def make_file_db(): + logging.info ("Cleaning up old files : " + cleanup_bash_command) + subprocess.run(cleanup_bash_command, shell=True, check=True, executable='/bin/bash') + logging.info ("Executing bash: " + create_db_file_bash(filenamedb)) + createdb_file_bash_command = create_db_file_bash(filenamedb) + subprocess.run(createdb_file_bash_command, shell=True, check=True, executable='/bin/bash') + return (filenamedb) + + # Create databases based on the database names found in the file created in the make_file_db task + # Notice the password is the same as the database name, manually change the password as needed + @task + def create_db(filenamedb): + try: + logging.info ("Opening file " + filenamedb) + with open(filenamedb) as file: + lines = file.readlines() + lines = [line.rstrip() for line in lines] + logging.info ("File "+filenamedb+" found, opened, read successfully.") + + # for each line in the db file (i.e. for each database), create user/database and auth object to access S3 + # databases are created all of the same size because the data will not be loaded into the databasebut in the NOS storage + for line in lines: + sqlcommandstr = "create user " + line + " as perm=" + perm_dbsize + ",password=" + line + "; grant all on " + line + " to " + line + " with grant option; grant create database on " + line + " to " + line + "; grant EXECUTE FUNCTION on TD_SYSFNLIB to " + line + "; database " + line + "; drop AUTHORIZATION " + line + "." + auth_name + "; CREATE AUTHORIZATION " + line + "." 
+ auth_name + " AS DEFINER TRUSTED USER '" + aws_access_key_id + "' PASSWORD '" + aws_secret_access_key + "';" + execute_sql_commands(sqlcommandstr) + file.close() + os.remove(filenamedb) + return(parqfilenamedbtab) + + except Exception as ex: + logging.error(str(ex)) + raise AirflowException + + + + + # Create temporary files containing all the table names. Bash commands use aws command line create a list of files/tables + # The argument parquetfilenamedb is a placeholder to support the airflow task flow. + @task + def make_file_dbtab(parqfilenamedbtab): + logging.info ("Create empty files bash: " + create_placeholder_files_command) + subprocess.run(create_placeholder_files_command, shell=True, check=True, executable='/bin/bash') + if (load_csv == 'Y'): + logging.info ("Executing csv bash: " + create_csv_tab_file_bash_command) + subprocess.run(create_csv_tab_file_bash_command, shell=True, check=True, executable='/bin/bash') + logging.info ("Executing csv join bash: " + join_csv_tab_files_bash_command) + subprocess.run(join_csv_tab_files_bash_command, shell=True, check=True, executable='/bin/bash') + if (load_json == 'Y'): + logging.info ("Executing json bash: " + create_json_tab_file_bash_command) + subprocess.run(create_json_tab_file_bash_command, shell=True, check=True, executable='/bin/bash') + logging.info ("Executing json join bash: " + join_json_tab_files_bash_command) + subprocess.run(join_json_tab_files_bash_command, shell=True, check=True, executable='/bin/bash') + if (load_parquet == 'Y'): + logging.info ("Executing parq bash: " + create_parq_db_tab_file_bash_command) + subprocess.run(create_parq_db_tab_file_bash_command, shell=True, check=True, executable='/bin/bash') + logging.info ("Executing parquet join bash: " + join_parquet_tab_files_bash_command) + subprocess.run(join_parquet_tab_files_bash_command, shell=True, check=True, executable='/bin/bash') + # logging.info ("Executing join_file bash: " + join_tab_files_bash_command) + # subprocess.run(join_tab_files_bash_command, shell=True, check=True, executable='/bin/bash') + return(alldbtab) + + # Open filename created by the make_file_dbtab task containing the table names and return the content + @task + def make_tab_list(filename): + # Open and read the file as a single buffer, then split sql commnds based on the ";" character, i.e. commands must be separated by ";" + logging.info ("Opening file " + filename) + try: + with open(filename) as file: + lines = file.readlines() + lines = [line.rstrip() for line in lines] + file.close() + # os.remove(filename) + logging.info ("File "+filename+" found, opened, read successfully.") + return (lines) + except Exception as ex: + logging.error ("File error ", str (ex).split ("\n") [0]) + raise AirflowException + + + # Based on the list of tables passed by the previous task and create the tables. + # The tables can be csv (prefixed by +csv+), json (prefixed by +json+) or parquet (no +parquet+ prefix, but simply database and table name) . + # Each file type has a different creation process and SQL code. 
+    @task
+    def create_tables(arg):
+        logging.info ("Creating table for record :"+arg)
+        argstring = arg.split(' ')
+        i = 1
+        db=""
+        tbl=""
+        tbltype=""
+        csvfilename = ""
+        jsonfilename = ""
+        for argstr in argstring:
+            argstr = argstr.strip()
+            logging.info ('Arg passed ' + str(i) + ': ' + argstr + ';')
+            if (i == 1) :
+                if (argstr == '+csv+'):
+                    tbltype = 'csv'
+                    db = csvdb
+                elif (argstr == '+json+'):
+                    tbltype = 'json'
+                    db = jsondb
+                else :
+                    tbltype = 'parquet'
+                    db = argstr
+                i = i + 1
+            elif (i == 2):
+                if (tbltype == 'csv'):
+                    csvfilename = argstr
+                    tbl = argstr.split('.',1)[0]
+                    db = csvdb
+                    bucketfile = csv_subbucket + '/' + csvfilename
+                    csvdelimiter = csv_delimiter(s3_bucket, csv_subbucket + '/' + csvfilename )
+                    logging.info ('CSV File path :' + bucketfile)
+                    logging.info ('CSV Delimiter :' + csvdelimiter)
+                elif (tbltype == 'json'):
+                    jsonfilename = argstr
+                    tbl = argstr.split('.',1)[0]
+                    db = jsondb
+                    bucketfile = json_subbucket + '/' + jsonfilename
+                    jsonfieldstr = json_fields(s3_bucket, json_subbucket + '/' + jsonfilename )
+                    logging.info ('JSON File path :' + bucketfile)
+                    logging.info ('JSON Fields String :' + jsonfieldstr)
+                else :
+                    tbl = argstr
+
+        logging.info ("Table type :" + tbltype + " Table name: " + tbl + " Database: " + db)
+
+
+        if (tbltype == 'parquet'):
+
+            sqlstr_parq_ft = "drop FOREIGN TABLE " + db + "." + tbl + "_parq_ft; CREATE FOREIGN TABLE " + db + "." + tbl + "_parq_ft ,EXTERNAL SECURITY DEFINER TRUSTED " + auth_name + " USING ( LOCATION ('" + s3_location_parq_ft + db + "/" + tbl + "/') STOREDAS ('PARQUET') ) NO PRIMARY INDEX PARTITION BY COLUMN; select cast(count(*) as bigint) from " + db + "." + tbl + "_parq_ft;"
+
+            logging.info ("Parquet foreign table string:" + sqlstr_parq_ft)
+
+            sqlstr_parq_nosfs = "drop TABLE " + db + "." + tbl + "_parq_nos; CREATE MULTISET TABLE " + db + "." + tbl + "_parq_nos, STORAGE = " + nos_storage + " as ( select * from antiselect ( on " + db + "." + tbl + "_parq_ft using exclude ('location')) as tbl) with data no primary index; select cast(count(*) as bigint) from " + db + "." + tbl + "_parq_nos; select cast(count(*) as bigint) from " + db + "." + tbl + "_parq_ft;"
+
+            logging.info ("Parquet nosfs table string:" + sqlstr_parq_nosfs)
+
+            parq_sqlstr_all = sqlstr_parq_ft + sqlstr_parq_nosfs
+
+            execute_sql_commands(parq_sqlstr_all)
+
+
+        elif (tbltype == 'csv'):
+            sqlstr_csv_ft = 'drop FOREIGN TABLE ' + csvdb + '.' + tbl + '_csv_ft; CREATE FOREIGN TABLE ' + csvdb + '.' + tbl + '_csv_ft ,EXTERNAL SECURITY DEFINER TRUSTED ' + auth_name + ' USING ( LOCATION (\'' + s3_location_csv_create + '/' + csvfilename +'\') ROWFORMAT ('+'\'{"field_delimiter":"' + csvdelimiter + '","record_delimiter":"\\n","character_set":"LATIN"}\') HEADER (\'TRUE\')); select cast(count(*) as bigint) from ' + csvdb + '.' + tbl + '_csv_ft;'
+
+            sqlstr_csv_nosfs = "drop TABLE " + csvdb + "." + tbl + "_csv_nos; CREATE MULTISET TABLE " + csvdb + "." + tbl + "_csv_nos, STORAGE = " + nos_storage + " as ( select * from antiselect ( on " + csvdb + "." + tbl + "_csv_ft using exclude ('location')) as tbl) with data no primary index; select cast(count(*) as bigint) from " + csvdb + "." + tbl + "_csv_nos; select cast(count(*) as bigint) from " + csvdb + "."
+ tbl + "_csv_ft;" + + logging.info ("Csv nosfs table string:" + sqlstr_csv_nosfs) + + csv_sqlstr_all = sqlstr_csv_ft + sqlstr_csv_nosfs + + # sqlcommands = csv_sqlstr_all.split(';') + execute_sql_commands(csv_sqlstr_all) + + elif (tbltype == 'json'): + + sqlstr_json_ft = 'drop FOREIGN TABLE ' + jsondb + '.' + tbl + '_json_ft; CREATE FOREIGN TABLE ' + jsondb + '.' + tbl + '_json_ft ,EXTERNAL SECURITY DEFINER TRUSTED ' + auth_name + ' USING ( LOCATION (\'' + s3_location_json_create + '/' + jsonfilename +'\')); select cast(count(*) as bigint) from ' + jsondb + '.' + tbl + '_json_ft;' + + + execute_sql_commands(sqlstr_json_ft) + + sqlstr_json_select = get_json_nos_select_comm(jsondb + '.' + tbl + '_json_ft') + + + sqlstr_json_nosfs = "drop TABLE " + jsondb + "." + tbl + "_json_nos; CREATE MULTISET TABLE " + jsondb + "." + tbl + "_json_nos, STORAGE = " + nos_storage + " as ( " + sqlstr_json_select + " ) with data no primary index; select cast(count(*) as bigint) from " + jsondb + "." + tbl + "_json_nos; select cast(count(*) as bigint) from " + jsondb + "." + tbl + "_json_ft;" + + execute_sql_commands(sqlstr_json_nosfs) + + + + + + + + + + + + + create_tables.expand(arg=make_tab_list(make_file_dbtab(create_db(make_file_db())))) + + +dag = taskflow() + + diff --git a/quickstarts/other-integrations/attachments/execute-airflow-workflows-that-use-dbt-with-teradata-vantage/docker-compose.yaml b/quickstarts/other-integrations/attachments/execute-airflow-workflows-that-use-dbt-with-teradata-vantage/docker-compose.yaml new file mode 100644 index 0000000000..82d30f487d --- /dev/null +++ b/quickstarts/other-integrations/attachments/execute-airflow-workflows-that-use-dbt-with-teradata-vantage/docker-compose.yaml @@ -0,0 +1,351 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +# Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL. +# 3 workers are created, others can be added. +# Added nginx web server for the dbt use case. +# +# WARNING: This configuration is for local development. Do not use it in a production deployment. +# +# This configuration supports basic configuration using environment variables or an .env file +# The following variables are supported: +# +# AIRFLOW_IMAGE_NAME - Docker image name used to run Airflow. +# Default: apache/airflow:|version| +# AIRFLOW_UID - User ID in Airflow containers +# Default: 50000 +# Those configurations are useful mostly in case of standalone testing/running Airflow in test/try-out mode +# +# _AIRFLOW_WWW_USER_USERNAME - Username for the administrator account (if requested). +# Default: airflow +# _AIRFLOW_WWW_USER_PASSWORD - Password for the administrator account (if requested). 
+# Default: airflow +# _PIP_ADDITIONAL_REQUIREMENTS - Additional PIP requirements to add when starting all containers. +# Default: '' +# +# Feel free to modify this file to suit your needs. +--- +version: '3' +x-airflow-common: + &airflow-common + # In order to add custom dependencies or upgrade provider packages you can use your extended image. + # Comment the image line, place your Dockerfile in the directory where you placed the docker-compose.yaml + # and uncomment the "build" line below, Then run `docker-compose build` to build the images. + image: ${AIRFLOW_IMAGE_NAME:-apache/airflow:2.2.4} + build: . + environment: + &airflow-common-env + AIRFLOW__CORE__EXECUTOR: CeleryExecutor + AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow + # For backward compatibility, with Airflow <2.3 + AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow + AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow + AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0 + AIRFLOW__CORE__FERNET_KEY: '' + AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true' + AIRFLOW__CORE__LOAD_EXAMPLES: 'true' + AIRFLOW__API__AUTH_BACKENDS: 'airflow.api.auth.backend.basic_auth' + _PIP_ADDITIONAL_REQUIREMENTS: '' + # _PIP_ADDITIONAL_REQUIREMENTS will be implemented in the Dockerfile, that is why it is commented out here + # _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:- sqlalchemy sqlalchemy-teradata teradatasql teradatasqlalchemy dbt-teradata} + volumes: + # Volumes host system directories (in this example host system is an AWS EC2 Linux instance) which will be mounted / readable / writable from all containers. + # The first directory is the path on the host system, the second (separated by ":") is the path on the docker container. 
+ # These will have to be changed for a different setups / systems + # ./dags - Airflow dags directory where the dag python files are places + - ./dags:/opt/airflow/dags + # ./logs - Airflow logs directory + - ./logs:/opt/airflow/logs + # plugins - Airflow plugin directory + - ./plugins:/opt/airflow/plugins + # airflow.cfg - airflow configuration file used when airflow is started on the container + - ./config/airflow.cfg:/opt/airflow/airflow.cfg + # /tmp - temporary directory used to create / store temporary files + - /tmp:/opt/airflow/tmp + # The dbt directory (here installed under /home/ec3-user) contains the dbt project + - /home/ec2-user/dbt/jaffle_shop/data:/opt/airflow/data + - /home/ec2-user/dbt/jaffle_shop/dbt_project.yml:/opt/airflow/dbt_project.yml + - /home/ec2-user/dbt/jaffle_shop/etc:/opt/airflow/etc + - /home/ec2-user/dbt/jaffle_shop/LICENSE:/opt/airflow/LICENSE + - /home/ec2-user/dbt/jaffle_shop/models:/opt/airflow/models + # The .dbt directory contain the .dbt configuration files + - /home/ec2-user/.dbt:/home/airflow/.dbt + - /home/ec2-user/dbt/jaffle_shop/target:/opt/airflow/target + user: "${AIRFLOW_UID:-50000}:0" + depends_on: + &airflow-common-depends-on + redis: + condition: service_healthy + postgres: + condition: service_healthy + +services: + postgres: + image: postgres:13 + environment: + POSTGRES_USER: airflow + POSTGRES_PASSWORD: airflow + POSTGRES_DB: airflow + volumes: + - postgres-db-volume:/var/lib/postgresql/data + healthcheck: + test: ["CMD", "pg_isready", "-U", "airflow"] + interval: 5s + retries: 5 + restart: always + + redis: + image: redis:latest + expose: + - 6379 + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 5s + timeout: 30s + retries: 50 + restart: always + # nginx added to visualize on a web browser the DBT generated documents. Nginx is here configured on host port 4000 + nginx: + image: nginx + ports: + - 4000:80 + volumes: + - /home/ec2-user/dbt/jaffle_shop/target:/usr/share/nginx/html + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost"] + interval: 1m30s + timeout: 10s + retries: 3 + start_period: 1m #version 3.4 minimum + + + airflow-webserver: + <<: *airflow-common + command: webserver + ports: + - 8080:8080 + healthcheck: + test: ["CMD", "curl", "--fail", "http://localhost:8080/health"] + interval: 10s + timeout: 10s + retries: 5 + restart: always + depends_on: + <<: *airflow-common-depends-on + airflow-init: + condition: service_completed_successfully + + airflow-scheduler: + <<: *airflow-common + command: scheduler + healthcheck: + test: ["CMD-SHELL", 'airflow jobs check --job-type SchedulerJob --hostname "$${HOSTNAME}"'] + interval: 10s + timeout: 10s + retries: 5 + restart: always + depends_on: + <<: *airflow-common-depends-on + airflow-init: + condition: service_completed_successfully + # Three workers installed so airflow can in parallel execute up to 3 tasks. 
If more are needed, just /cut/paste/add/rename additional worker config sessions + airflow-worker_1: + <<: *airflow-common + command: celery worker + healthcheck: + test: + - "CMD-SHELL" + - 'celery --app airflow.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}"' + interval: 10s + timeout: 10s + retries: 5 + environment: + <<: *airflow-common-env + # Required to handle warm shutdown of the celery workers properly + # See https://airflow.apache.org/docs/docker-stack/entrypoint.html#signal-propagation + DUMB_INIT_SETSID: "0" + restart: always + depends_on: + <<: *airflow-common-depends-on + airflow-init: + condition: service_completed_successfully + + airflow-worker_2: + <<: *airflow-common + command: celery worker + healthcheck: + test: + - "CMD-SHELL" + - 'celery --app airflow.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}"' + interval: 10s + timeout: 10s + retries: 5 + environment: + <<: *airflow-common-env + # Required to handle warm shutdown of the celery workers properly + # See https://airflow.apache.org/docs/docker-stack/entrypoint.html#signal-propagation + DUMB_INIT_SETSID: "0" + restart: always + depends_on: + <<: *airflow-common-depends-on + airflow-init: + condition: service_completed_successfully + + airflow-worker_3: + <<: *airflow-common + command: celery worker + healthcheck: + test: + - "CMD-SHELL" + - 'celery --app airflow.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}"' + interval: 10s + timeout: 10s + retries: 5 + environment: + <<: *airflow-common-env + # Required to handle warm shutdown of the celery workers properly + # See https://airflow.apache.org/docs/docker-stack/entrypoint.html#signal-propagation + DUMB_INIT_SETSID: "0" + restart: always + depends_on: + <<: *airflow-common-depends-on + airflow-init: + condition: service_completed_successfully + + airflow-triggerer: + <<: *airflow-common + command: triggerer + healthcheck: + test: ["CMD-SHELL", 'airflow jobs check --job-type TriggererJob --hostname "$${HOSTNAME}"'] + interval: 10s + timeout: 10s + retries: 5 + restart: always + depends_on: + <<: *airflow-common-depends-on + airflow-init: + condition: service_completed_successfully + + airflow-init: + <<: *airflow-common + entrypoint: /bin/bash + # yamllint disable rule:line-length + command: + - -c + - | + function ver() { + printf "%04d%04d%04d%04d" $${1//./ } + } + airflow_version=$$(gosu airflow airflow version) + airflow_version_comparable=$$(ver $${airflow_version}) + min_airflow_version=2.2.0 + min_airflow_version_comparable=$$(ver $${min_airflow_version}) + if (( airflow_version_comparable < min_airflow_version_comparable )); then + echo + echo -e "\033[1;31mERROR!!!: Too old Airflow version $${airflow_version}!\e[0m" + echo "The minimum Airflow version supported: $${min_airflow_version}. Only use this or higher!" + echo + exit 1 + fi + if [[ -z "${AIRFLOW_UID}" ]]; then + echo + echo -e "\033[1;33mWARNING!!!: AIRFLOW_UID not set!\e[0m" + echo "If you are on Linux, you SHOULD follow the instructions below to set " + echo "AIRFLOW_UID environment variable, otherwise files will be owned by root." 
+ echo "For other operating systems you can get rid of the warning with manually created .env file:" + echo " See: https://airflow.apache.org/docs/apache-airflow/stable/start/docker.html#setting-the-right-airflow-user" + echo + fi + one_meg=1048576 + mem_available=$$(($$(getconf _PHYS_PAGES) * $$(getconf PAGE_SIZE) / one_meg)) + cpus_available=$$(grep -cE 'cpu[0-9]+' /proc/stat) + disk_available=$$(df / | tail -1 | awk '{print $$4}') + warning_resources="false" + if (( mem_available < 4000 )) ; then + echo + echo -e "\033[1;33mWARNING!!!: Not enough memory available for Docker.\e[0m" + echo "At least 4GB of memory required. You have $$(numfmt --to iec $$((mem_available * one_meg)))" + echo + warning_resources="true" + fi + if (( cpus_available < 2 )); then + echo + echo -e "\033[1;33mWARNING!!!: Not enough CPUS available for Docker.\e[0m" + echo "At least 2 CPUs recommended. You have $${cpus_available}" + echo + warning_resources="true" + fi + if (( disk_available < one_meg * 10 )); then + echo + echo -e "\033[1;33mWARNING!!!: Not enough Disk space available for Docker.\e[0m" + echo "At least 10 GBs recommended. You have $$(numfmt --to iec $$((disk_available * 1024 )))" + echo + warning_resources="true" + fi + if [[ $${warning_resources} == "true" ]]; then + echo + echo -e "\033[1;33mWARNING!!!: You have not enough resources to run Airflow (see above)!\e[0m" + echo "Please follow the instructions to increase amount of resources available:" + echo " https://airflow.apache.org/docs/apache-airflow/stable/start/docker.html#before-you-begin" + echo + fi + mkdir -p /sources/logs /sources/dags /sources/plugins + chown -R "${AIRFLOW_UID}:0" /sources/{logs,dags,plugins} + exec /entrypoint airflow version + # yamllint enable rule:line-length + environment: + <<: *airflow-common-env + _AIRFLOW_DB_UPGRADE: 'true' + _AIRFLOW_WWW_USER_CREATE: 'true' + _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow} + _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow} + user: "0:0" + volumes: + - .:/sources + + airflow-cli: + <<: *airflow-common + profiles: + - debug + environment: + <<: *airflow-common-env + CONNECTION_CHECK_MAX_COUNT: "0" + # Workaround for entrypoint issue. 
See: https://github.com/apache/airflow/issues/16252 + command: + - bash + - -c + - airflow + + flower: + <<: *airflow-common + command: celery flower + ports: + - 5555:5555 + healthcheck: + test: ["CMD", "curl", "--fail", "http://localhost:5555/"] + interval: 10s + timeout: 10s + retries: 5 + restart: always + depends_on: + <<: *airflow-common-depends-on + airflow-init: + condition: service_completed_successfully + +volumes: + postgres-db-volume: diff --git a/quickstarts/other-integrations/attachments/execute-airflow-workflows-that-use-dbt-with-teradata-vantage/profiles.yml b/quickstarts/other-integrations/attachments/execute-airflow-workflows-that-use-dbt-with-teradata-vantage/profiles.yml new file mode 100644 index 0000000000..691c767cc8 --- /dev/null +++ b/quickstarts/other-integrations/attachments/execute-airflow-workflows-that-use-dbt-with-teradata-vantage/profiles.yml @@ -0,0 +1,15 @@ +jaffle_shop: + outputs: + dev: + type: teradata + host: 192.11.25.33 + user: jaffle_shop + password: abcd + logmech: TD2 + schema: jaffle_shop + tmode: ANSI + threads: 1 + timeout_seconds: 300 + priority: interactive + retries: 1 + target: dev diff --git a/quickstarts/other-integrations/attachments/execute-airflow-workflows-that-use-dbt-with-teradata-vantage/variables.json b/quickstarts/other-integrations/attachments/execute-airflow-workflows-that-use-dbt-with-teradata-vantage/variables.json new file mode 100644 index 0000000000..0905f82ac7 --- /dev/null +++ b/quickstarts/other-integrations/attachments/execute-airflow-workflows-that-use-dbt-with-teradata-vantage/variables.json @@ -0,0 +1,36 @@ +{ +"aws_access_key_id" : "*******", +"aws_secret_access_key" : "**************", +"s3_location_parq_ls" : "s3://tc-001-teracloud-nos-us-west-2-374222bfg/soc/nosexports/", +"s3_location_parq_ft" : "/s3/s3.amazonaws.com/tc-001-teracloud-nos-us-west-2-374222bfg/soc/nosexports/", +"s3_location_csv_ls" : "s3://tc-001-teracloud-nos-us-west-2-374222bfg/csvdata/", +"s3_location_csv_create" : "/s3/s3.amazonaws.com/tc-001-teracloud-nos-us-west-2-374222bfg/csvdata", +"s3_location_csv_ft" : "/s3/s3.amazonaws.com/tc-001-teracloud-nos-us-west-2-374222bfg/nosexports/", +"s3_location_json_ls" : "s3://tc-001-teracloud-nos-us-west-2-374222bfg/jsondata/", +"s3_location_json_ft" : "/s3/s3.amazonaws.com/tc-001-teracloud-nos-us-west-2-374222bfg/nosexports/", +"s3_location_json_create" : "/s3/s3.amazonaws.com/tc-001-teracloud-nos-us-west-2-374222bfg/jsondata", +"s3_bucket" : "tc-perf-teracloud-nos-us-west-2-374222bfg", +"csv_subbucket" : "csvdata", +"json_subbucket" : "jsondata", +"filenamedb" :"/opt/airflow/tmp/db.txt", +"parqfilenamedbtab" :"/opt/airflow/tmp/parqdbtbl.txt", +"csvfilenamedbtab" :"/opt/airflow/tmp/csvdbtbl.txt", +"jsonfilenamedbtab" :"/opt/airflow/tmp/jsondbtbl.txt", +"alldbtab" : "/opt/airflow/tmp/alldbtbl.txt", +"csvdb" :"csvdb", +"supported_csvdelimiters" : ",:|\t", +"supported_csvlineterminator" : "\n", +"jsondb" : "jsondb", +"DB_username" : "dbc", +"DB_password" : "dbc", +"DB_ip_address" : "***.***.***.***", +"auth_name" : "soc_Auth_NOS", +"region_name" : "us-west-2", +"output_file" : "/tmp/outfile.txt", +"perm_dbsize" : "5e9", +"nos_storage" : "TD_NOSFS_STORAGE", +"load_csv" : "Y", +"load_json" : "Y", +"load_parquet" : "Y", +"linenumax" : "100" +} diff --git a/quickstarts/other-integrations/images/configure-a-teradata-connection-in-datahub/create-new-source.png b/quickstarts/other-integrations/images/configure-a-teradata-connection-in-datahub/create-new-source.png new file mode 100644 index 
0000000000..3c47435443 Binary files /dev/null and b/quickstarts/other-integrations/images/configure-a-teradata-connection-in-datahub/create-new-source.png differ diff --git a/quickstarts/other-integrations/images/configure-a-teradata-connection-in-datahub/datasets.png b/quickstarts/other-integrations/images/configure-a-teradata-connection-in-datahub/datasets.png new file mode 100644 index 0000000000..f2665ef5a8 Binary files /dev/null and b/quickstarts/other-integrations/images/configure-a-teradata-connection-in-datahub/datasets.png differ diff --git a/quickstarts/other-integrations/images/configure-a-teradata-connection-in-datahub/entities-list.png b/quickstarts/other-integrations/images/configure-a-teradata-connection-in-datahub/entities-list.png new file mode 100644 index 0000000000..9fc0de927a Binary files /dev/null and b/quickstarts/other-integrations/images/configure-a-teradata-connection-in-datahub/entities-list.png differ diff --git a/quickstarts/other-integrations/images/configure-a-teradata-connection-in-datahub/execute.png b/quickstarts/other-integrations/images/configure-a-teradata-connection-in-datahub/execute.png new file mode 100644 index 0000000000..1dc2d37bfb Binary files /dev/null and b/quickstarts/other-integrations/images/configure-a-teradata-connection-in-datahub/execute.png differ diff --git a/quickstarts/other-integrations/images/configure-a-teradata-connection-in-datahub/finish-up.png b/quickstarts/other-integrations/images/configure-a-teradata-connection-in-datahub/finish-up.png new file mode 100644 index 0000000000..b9d0aa83cc Binary files /dev/null and b/quickstarts/other-integrations/images/configure-a-teradata-connection-in-datahub/finish-up.png differ diff --git a/quickstarts/other-integrations/images/configure-a-teradata-connection-in-datahub/ingestion-icon.png b/quickstarts/other-integrations/images/configure-a-teradata-connection-in-datahub/ingestion-icon.png new file mode 100644 index 0000000000..ad99432a44 Binary files /dev/null and b/quickstarts/other-integrations/images/configure-a-teradata-connection-in-datahub/ingestion-icon.png differ diff --git a/quickstarts/other-integrations/images/configure-a-teradata-connection-in-datahub/ingestion-result.png b/quickstarts/other-integrations/images/configure-a-teradata-connection-in-datahub/ingestion-result.png new file mode 100644 index 0000000000..eee487a25a Binary files /dev/null and b/quickstarts/other-integrations/images/configure-a-teradata-connection-in-datahub/ingestion-result.png differ diff --git a/quickstarts/other-integrations/images/configure-a-teradata-connection-in-datahub/lineage-weather.png b/quickstarts/other-integrations/images/configure-a-teradata-connection-in-datahub/lineage-weather.png new file mode 100644 index 0000000000..5b09a28376 Binary files /dev/null and b/quickstarts/other-integrations/images/configure-a-teradata-connection-in-datahub/lineage-weather.png differ diff --git a/quickstarts/other-integrations/images/configure-a-teradata-connection-in-datahub/lineage.png b/quickstarts/other-integrations/images/configure-a-teradata-connection-in-datahub/lineage.png new file mode 100644 index 0000000000..edf5732fc6 Binary files /dev/null and b/quickstarts/other-integrations/images/configure-a-teradata-connection-in-datahub/lineage.png differ diff --git a/quickstarts/other-integrations/images/configure-a-teradata-connection-in-datahub/new-ingestion-source.png b/quickstarts/other-integrations/images/configure-a-teradata-connection-in-datahub/new-ingestion-source.png new file mode 100644 index 
0000000000..24cfd9bfa0 Binary files /dev/null and b/quickstarts/other-integrations/images/configure-a-teradata-connection-in-datahub/new-ingestion-source.png differ diff --git a/quickstarts/other-integrations/images/configure-a-teradata-connection-in-datahub/schema.png b/quickstarts/other-integrations/images/configure-a-teradata-connection-in-datahub/schema.png new file mode 100644 index 0000000000..3e1b374a66 Binary files /dev/null and b/quickstarts/other-integrations/images/configure-a-teradata-connection-in-datahub/schema.png differ diff --git a/quickstarts/other-integrations/images/configure-a-teradata-connection-in-datahub/select-other-source.png b/quickstarts/other-integrations/images/configure-a-teradata-connection-in-datahub/select-other-source.png new file mode 100644 index 0000000000..9663a52475 Binary files /dev/null and b/quickstarts/other-integrations/images/configure-a-teradata-connection-in-datahub/select-other-source.png differ diff --git a/quickstarts/other-integrations/images/configure-a-teradata-connection-in-datahub/set-schedule.png b/quickstarts/other-integrations/images/configure-a-teradata-connection-in-datahub/set-schedule.png new file mode 100644 index 0000000000..dbea9babf4 Binary files /dev/null and b/quickstarts/other-integrations/images/configure-a-teradata-connection-in-datahub/set-schedule.png differ diff --git a/quickstarts/other-integrations/images/configure-a-teradata-connection-in-dbeaver/configure-connection.png b/quickstarts/other-integrations/images/configure-a-teradata-connection-in-dbeaver/configure-connection.png new file mode 100644 index 0000000000..df14883bda Binary files /dev/null and b/quickstarts/other-integrations/images/configure-a-teradata-connection-in-dbeaver/configure-connection.png differ diff --git a/quickstarts/other-integrations/images/configure-a-teradata-connection-in-dbeaver/configure-driver-string.png b/quickstarts/other-integrations/images/configure-a-teradata-connection-in-dbeaver/configure-driver-string.png new file mode 100644 index 0000000000..b073165776 Binary files /dev/null and b/quickstarts/other-integrations/images/configure-a-teradata-connection-in-dbeaver/configure-driver-string.png differ diff --git a/quickstarts/other-integrations/images/configure-a-teradata-connection-in-dbeaver/copy-driver.png b/quickstarts/other-integrations/images/configure-a-teradata-connection-in-dbeaver/copy-driver.png new file mode 100644 index 0000000000..e73277ed90 Binary files /dev/null and b/quickstarts/other-integrations/images/configure-a-teradata-connection-in-dbeaver/copy-driver.png differ diff --git a/quickstarts/other-integrations/images/configure-a-teradata-connection-in-dbeaver/create-connection.png b/quickstarts/other-integrations/images/configure-a-teradata-connection-in-dbeaver/create-connection.png new file mode 100644 index 0000000000..f894a5cca2 Binary files /dev/null and b/quickstarts/other-integrations/images/configure-a-teradata-connection-in-dbeaver/create-connection.png differ diff --git a/quickstarts/other-integrations/images/configure-a-teradata-connection-in-dbeaver/plug-icon.png b/quickstarts/other-integrations/images/configure-a-teradata-connection-in-dbeaver/plug-icon.png new file mode 100644 index 0000000000..fa3901a781 Binary files /dev/null and b/quickstarts/other-integrations/images/configure-a-teradata-connection-in-dbeaver/plug-icon.png differ diff --git a/quickstarts/other-integrations/images/configure-a-teradata-connection-in-dbeaver/select-your-database-windows.png 
b/quickstarts/other-integrations/images/configure-a-teradata-connection-in-dbeaver/select-your-database-windows.png new file mode 100644 index 0000000000..d84ca6de29 Binary files /dev/null and b/quickstarts/other-integrations/images/configure-a-teradata-connection-in-dbeaver/select-your-database-windows.png differ diff --git a/quickstarts/other-integrations/images/configure-a-teradata-connection-in-dbeaver/select-your-database.png b/quickstarts/other-integrations/images/configure-a-teradata-connection-in-dbeaver/select-your-database.png new file mode 100644 index 0000000000..db45329fe9 Binary files /dev/null and b/quickstarts/other-integrations/images/configure-a-teradata-connection-in-dbeaver/select-your-database.png differ diff --git a/quickstarts/other-integrations/images/configure-a-teradata-connection-in-dbeaver/teradata-connection-settings-ssh-windows.png b/quickstarts/other-integrations/images/configure-a-teradata-connection-in-dbeaver/teradata-connection-settings-ssh-windows.png new file mode 100644 index 0000000000..848d502e0e Binary files /dev/null and b/quickstarts/other-integrations/images/configure-a-teradata-connection-in-dbeaver/teradata-connection-settings-ssh-windows.png differ diff --git a/quickstarts/other-integrations/images/configure-a-teradata-connection-in-dbeaver/teradata-connection-settings-ssh.png b/quickstarts/other-integrations/images/configure-a-teradata-connection-in-dbeaver/teradata-connection-settings-ssh.png new file mode 100644 index 0000000000..61d75adc0b Binary files /dev/null and b/quickstarts/other-integrations/images/configure-a-teradata-connection-in-dbeaver/teradata-connection-settings-ssh.png differ diff --git a/quickstarts/other-integrations/images/configure-a-teradata-connection-in-dbeaver/teradata-connection-settings-windows-ldap.png b/quickstarts/other-integrations/images/configure-a-teradata-connection-in-dbeaver/teradata-connection-settings-windows-ldap.png new file mode 100644 index 0000000000..e8c6495c42 Binary files /dev/null and b/quickstarts/other-integrations/images/configure-a-teradata-connection-in-dbeaver/teradata-connection-settings-windows-ldap.png differ diff --git a/quickstarts/other-integrations/images/configure-a-teradata-connection-in-dbeaver/teradata-connection-settings-windows.png b/quickstarts/other-integrations/images/configure-a-teradata-connection-in-dbeaver/teradata-connection-settings-windows.png new file mode 100644 index 0000000000..7283e06f76 Binary files /dev/null and b/quickstarts/other-integrations/images/configure-a-teradata-connection-in-dbeaver/teradata-connection-settings-windows.png differ diff --git a/quickstarts/other-integrations/images/configure-a-teradata-connection-in-dbeaver/teradata-connection-settings.png b/quickstarts/other-integrations/images/configure-a-teradata-connection-in-dbeaver/teradata-connection-settings.png new file mode 100644 index 0000000000..1b09bf6473 Binary files /dev/null and b/quickstarts/other-integrations/images/configure-a-teradata-connection-in-dbeaver/teradata-connection-settings.png differ diff --git a/quickstarts/other-integrations/images/execute-airflow-workflows-that-use-dbt-with-teradata-vantage/admin-dropdown.png b/quickstarts/other-integrations/images/execute-airflow-workflows-that-use-dbt-with-teradata-vantage/admin-dropdown.png new file mode 100644 index 0000000000..09dd180dc3 Binary files /dev/null and b/quickstarts/other-integrations/images/execute-airflow-workflows-that-use-dbt-with-teradata-vantage/admin-dropdown.png differ diff --git 
a/quickstarts/other-integrations/images/execute-airflow-workflows-that-use-dbt-with-teradata-vantage/import-variables.png b/quickstarts/other-integrations/images/execute-airflow-workflows-that-use-dbt-with-teradata-vantage/import-variables.png new file mode 100644 index 0000000000..7cc50d83a4 Binary files /dev/null and b/quickstarts/other-integrations/images/execute-airflow-workflows-that-use-dbt-with-teradata-vantage/import-variables.png differ diff --git a/quickstarts/other-integrations/images/getting-started-dbt-feast-teradata-pipeline/dbt-feast.png b/quickstarts/other-integrations/images/getting-started-dbt-feast-teradata-pipeline/dbt-feast.png new file mode 100644 index 0000000000..dd7d8a8d60 Binary files /dev/null and b/quickstarts/other-integrations/images/getting-started-dbt-feast-teradata-pipeline/dbt-feast.png differ diff --git a/quickstarts/other-integrations/images/integrate-teradata-vantage-with-knime/add-jar.png b/quickstarts/other-integrations/images/integrate-teradata-vantage-with-knime/add-jar.png new file mode 100644 index 0000000000..6de7092bce Binary files /dev/null and b/quickstarts/other-integrations/images/integrate-teradata-vantage-with-knime/add-jar.png differ diff --git a/quickstarts/other-integrations/images/integrate-teradata-vantage-with-knime/apply-and-close.png b/quickstarts/other-integrations/images/integrate-teradata-vantage-with-knime/apply-and-close.png new file mode 100644 index 0000000000..41ddfff79b Binary files /dev/null and b/quickstarts/other-integrations/images/integrate-teradata-vantage-with-knime/apply-and-close.png differ diff --git a/quickstarts/other-integrations/images/integrate-teradata-vantage-with-knime/enter-configuration.png b/quickstarts/other-integrations/images/integrate-teradata-vantage-with-knime/enter-configuration.png new file mode 100644 index 0000000000..4028ae692e Binary files /dev/null and b/quickstarts/other-integrations/images/integrate-teradata-vantage-with-knime/enter-configuration.png differ diff --git a/quickstarts/other-integrations/images/integrate-teradata-vantage-with-knime/execute-node.png b/quickstarts/other-integrations/images/integrate-teradata-vantage-with-knime/execute-node.png new file mode 100644 index 0000000000..e1c0725ce2 Binary files /dev/null and b/quickstarts/other-integrations/images/integrate-teradata-vantage-with-knime/execute-node.png differ diff --git a/quickstarts/other-integrations/images/integrate-teradata-vantage-with-knime/register-driver.png b/quickstarts/other-integrations/images/integrate-teradata-vantage-with-knime/register-driver.png new file mode 100644 index 0000000000..d57bb6f83c Binary files /dev/null and b/quickstarts/other-integrations/images/integrate-teradata-vantage-with-knime/register-driver.png differ diff --git a/quickstarts/other-integrations/images/integrate-teradata-vantage-with-knime/start-configuration.png b/quickstarts/other-integrations/images/integrate-teradata-vantage-with-knime/start-configuration.png new file mode 100644 index 0000000000..e409f70d2a Binary files /dev/null and b/quickstarts/other-integrations/images/integrate-teradata-vantage-with-knime/start-configuration.png differ diff --git a/quickstarts/other-integrations/images/integrate-teradata-vantage-with-knime/test-connection-1.png b/quickstarts/other-integrations/images/integrate-teradata-vantage-with-knime/test-connection-1.png new file mode 100644 index 0000000000..a87fc46354 Binary files /dev/null and b/quickstarts/other-integrations/images/integrate-teradata-vantage-with-knime/test-connection-1.png 
differ diff --git a/quickstarts/other-integrations/images/integrate-teradata-vantage-with-knime/test-connection-2.png b/quickstarts/other-integrations/images/integrate-teradata-vantage-with-knime/test-connection-2.png new file mode 100644 index 0000000000..51973412b0 Binary files /dev/null and b/quickstarts/other-integrations/images/integrate-teradata-vantage-with-knime/test-connection-2.png differ diff --git a/quickstarts/other-integrations/images/integrate-teradata-vantage-with-knime/test-connection-apply.png b/quickstarts/other-integrations/images/integrate-teradata-vantage-with-knime/test-connection-apply.png new file mode 100644 index 0000000000..7d11dcfece Binary files /dev/null and b/quickstarts/other-integrations/images/integrate-teradata-vantage-with-knime/test-connection-apply.png differ diff --git a/quickstarts/other-integrations/images/integrate-teradata-vantage-with-knime/view-results-final.png b/quickstarts/other-integrations/images/integrate-teradata-vantage-with-knime/view-results-final.png new file mode 100644 index 0000000000..add30b8e51 Binary files /dev/null and b/quickstarts/other-integrations/images/integrate-teradata-vantage-with-knime/view-results-final.png differ diff --git a/quickstarts/other-integrations/images/integrate-teradata-vantage-with-knime/view-results.png b/quickstarts/other-integrations/images/integrate-teradata-vantage-with-knime/view-results.png new file mode 100644 index 0000000000..9456d5d2de Binary files /dev/null and b/quickstarts/other-integrations/images/integrate-teradata-vantage-with-knime/view-results.png differ diff --git a/quickstarts/vantagecloud-lake/_category_.json b/quickstarts/vantagecloud-lake/_category_.json new file mode 100644 index 0000000000..58fbcda1af --- /dev/null +++ b/quickstarts/vantagecloud-lake/_category_.json @@ -0,0 +1,4 @@ +{ + "label": "VantageCloud Lake", + "position": 7 + } \ No newline at end of file diff --git a/quickstarts/vantagecloud-lake/_partials/vantagecloud-lake-request.md b/quickstarts/vantagecloud-lake/_partials/vantagecloud-lake-request.md new file mode 100644 index 0000000000..b46a2ed711 --- /dev/null +++ b/quickstarts/vantagecloud-lake/_partials/vantagecloud-lake-request.md @@ -0,0 +1,3 @@ +:::tip +To request a VantageCloud Lake environment, refer to the form provided in this [link](https://www.teradata.com/about-us/contact). If you already have a VantageCloud Lake environment and seek guidance on configuration, please consult this [guide](https://quickstarts.teradata.com/getting-started-with-vantagecloud-lake.html). +::: \ No newline at end of file diff --git a/quickstarts/vantagecloud-lake/getting-started-with-vantagecloud-lake.md b/quickstarts/vantagecloud-lake/getting-started-with-vantagecloud-lake.md new file mode 100644 index 0000000000..ba6d19bafc --- /dev/null +++ b/quickstarts/vantagecloud-lake/getting-started-with-vantagecloud-lake.md @@ -0,0 +1,266 @@ +--- +sidebar_position: 1 +author: Vidhan Bhonsle +email: vidhan.bhonsle@teradata.com +page_last_update: January 2nd, 2024 +description: Create your own environment in VantageCloud Lake +keywords: [data warehouses, compute storage separation, teradata, vantage, cloud data platform, business intelligence, enterprise analytics, jupyter, teradatasql, ipython-sql, teradatasqlalchemy, vantagecloud, vantagecloud lake, ip address, public internet, lake] +--- +# Getting Started with VantageCloud Lake + +## Overview + +Teradata VantageCloud Lake is Teradata's next-generation, cloud-native analytics and data platform. 
It provides lakehouse deployment patterns along with the ability to run independent elastic workloads using an object storage-centric design.
+
+It empowers organizations to unlock their data, activate analytics, and accelerate value. Customers can optimize their analytics environment using specially configured compute cluster resources that best accommodate their workload requirements.
+
+![VantageCloud](../images/VantageCloud.png)
+
+VantageCloud Lake provides all the benefits you’d expect in a cloud solution plus Teradata’s differentiated technology stack, including the industry-leading Analytics Database, ClearScape Analytics, and the QueryGrid data fabric.
+
+## Sign-on to VantageCloud Lake
+
+:::important
+To get a VantageCloud Lake sign-on link and credentials, fill out the [contact form](https://www.teradata.com/about-us/contact) to reach the Teradata team.
+:::
+
+Go to the URL provided by Teradata, for example **_ourcompany.innovationlabs.teradata.com_**, and sign on:
+
+* Existing customers can sign on with their organization admin username (email address) and password.
+* New customers can sign on with their organization admin username (the email address from the welcome letter) and the password they created.
+
+:::note
+Click [here](https://login.customer.teradata.com/ext/pwdreset/Identify?AdapterId=CDSCustomer) to reset the organization admin password.
+:::
+
+![Sign On](../images/lake_sign_on.png)
+
+Signing on takes you to the VantageCloud Lake welcome page.
+
+![Welcome Page](../images/lake_welcome_page.png)
+
+The welcome page has a navigation menu that gives you complete control of your environments and provides the tools you need:
+
+![Navigation Menu Items](../images/lake_expanded_menu.png)
+
+* Vantage - Home page of the VantageCloud Lake portal.
+* [Environments](https://docs.teradata.com/r/Teradata-VantageCloud-Lake/Getting-Started-First-Sign-On-by-Organization-Admin/Step-1-Signing-On-and-Creating-Your-First-Environment) - Create environments and see all the environments you have created.
+* [Organization](https://docs.teradata.com/r/Teradata-VantageCloud-Lake/Introduction-to-VantageCloud-Lake/VantageCloud-Lake-Organizations-and-Environments) - View the organization configuration, manage Organization Admins, and view the configuration and status of your account.
+* [Consumption](https://docs.teradata.com/r/Teradata-VantageCloud-Lake/Managing-Compute-Resources/Review-Consumption-Usage) - Monitor how your organization consumes compute and storage resources.
+* [Cost Calculator](https://docs.teradata.com/r/Teradata-VantageCloud-Lake/Using-VantageCloud-Lake-Console-to-Manage-VantageCloud-Lake/Using-the-Consumption-Estimates) - Calculate the cost and consumption of your environment and of the whole organization.
+* [Queries](https://docs.teradata.com/r/Teradata-VantageCloud-Lake/Running-and-Monitoring-Queries/Monitoring-and-Managing-Queries) - Inspect an environment's queries to understand their efficiency.
+* [Editor](https://docs.teradata.com/r/Teradata-VantageCloud-Lake/Running-and-Monitoring-Queries) - Create and run queries in an editor.
+* [Data Copy](https://docs.teradata.com/r/Teradata-VantageCloud-Lake/Data-Copy) - Provision, configure, and run data copy (also known as Data Mover) jobs from the VantageCloud Lake Console.
+
+
+## Create an Environment
+To create a primary cluster environment, click "Environments" in the navigation menu. In the view that opens, click the "Create" button at the top right of the page.
+
+![Environment Page](../images/lake_environment_page.png)
+
+### Environment configuration
+
+Fill out the Environment configuration fields:
+
+| **Item** | **Description** |
+|--------------------|---------------------------------------------------------------------------------|
+| *Environment name* | A contextual name for the new environment |
+| *Region* | The available region list was predetermined during the sales process. |
+| *Package* | There are two service packages available to select from: |
+| | - Lake: Premier 24x7 cloud support |
+| | - Lake+: Premier 24x7 Priority cloud support + industry data models |
+
+![Environment configuration](../images/lake_environment_configuration.png)
+
+:::important
+The **Consumption estimates**, to your right, provide guidance for configuring the environment. See [Using the Consumption Estimates](https://docs.teradata.com/r/Teradata-VantageCloud-Lake/Using-VantageCloud-Lake-Console-to-Manage-VantageCloud-Lake/Using-the-Consumption-Estimates) for more detail.
+:::
+
+### Primary cluster configuration
+
+Fill out the Primary cluster configuration fields:
+
+| **Item** | **Description** |
+|--------------------|---------------------------------------------------------------------------------|
+| *Instance size* | Select an instance size suitable for your use case. The per-instance unit values are listed in the table below. |
+| *Instance count* | 2 to 64. The number of nodes in the primary cluster. |
+| *Instance storage* | 1 to 72 TB per instance. |
+
+| **Size** | **Lake (units)** | **Lake+ (units)** |
+|----------|------------------|-------------------|
+| XSmall | 2 | 2.4 |
+| Small | 4 | 4.8 |
+| Medium | 7 | 8.4 |
+| Large | 10 | 12 |
+| XLarge | 13 | 15.6 |
+| 2XLarge | 20 | 24 |
+| 3XLarge | 27 | 32.4 |
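+If it helps to sanity-check a planned configuration against these limits, the short Python sketch below (an illustration only, not an official Teradata tool; the function and dictionary names are invented for this sketch) encodes the ranges and per-instance unit values from the tables above:
+
+```python
+# Illustrative sketch: validate a planned primary-cluster configuration
+# against the documented ranges and look up its per-instance unit value.
+LAKE_UNITS = {"XSmall": 2, "Small": 4, "Medium": 7, "Large": 10,
+              "XLarge": 13, "2XLarge": 20, "3XLarge": 27}
+LAKE_PLUS_UNITS = {"XSmall": 2.4, "Small": 4.8, "Medium": 8.4, "Large": 12,
+                   "XLarge": 15.6, "2XLarge": 24, "3XLarge": 32.4}
+
+def check_primary_cluster(size, instance_count, storage_tb, package="Lake"):
+    """Return the per-instance unit value if the configuration is within range."""
+    units = LAKE_UNITS if package == "Lake" else LAKE_PLUS_UNITS
+    if size not in units:
+        raise ValueError(f"Unknown instance size: {size}")
+    if not 2 <= instance_count <= 64:
+        raise ValueError("Instance count must be between 2 and 64")
+    if not 1 <= storage_tb <= 72:
+        raise ValueError("Instance storage must be between 1 and 72 TB per instance")
+    return units[size]
+
+print(check_primary_cluster("Large", instance_count=4, storage_tb=10, package="Lake+"))  # 12
+```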
+![Primary cluster configuration](../images/lake_primary_cluster_config.png)
+
+### Database credentials
+
+Fill out the Database credentials fields:
+
+| **Item** | **Description** |
+|--------------------|---------------------------------------------------------------------------------|
+| *DBC password* | The primary administrative account in a Teradata Vantage environment is known as "dbc". Similar to the root user in Linux, the dbc account holds comprehensive administrative privileges. It is recommended to create additional administrative users for routine tasks after the environment is created, and to refrain from sharing or using the dbc credentials. |
+
+Set the password for dbc:
+* Between 8 and 64 characters
+* Allows both alphanumeric and special characters
+* No dictionary words
+
+![Database credentials](../images/lake_database_cred.png)
+
+### Advanced options
+
+To get started quickly, you can select **Use Defaults** or define the additional option settings.
+
+![Advanced option with use defaults](../images/lake_advanced_option_default.png)
+
+| **Item** | **Description** |
+|--------------------|---------------------------------------------------------------------------------|
+| *AMPs per instance* | Workload management: select the number of AMPs per instance for the instance size you selected. |
+| *AWS: Storage encryption* | Configure encryption for customer data: Managed by Teradata, Customer managed, or Key Alias ARN. See [Finding the key ID and key ARN](https://docs.aws.amazon.com/kms/latest/developerguide/find-cmk-id-arn.html). |
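+If you choose a customer-managed key, it is referenced by its key ARN or alias ARN. For reference, AWS KMS ARNs follow the general shapes shown below; the region, account ID, key ID, and alias name here are placeholder values for illustration only:
+
+```
+arn:aws:kms:us-west-2:111122223333:key/1234abcd-12ab-34cd-56ef-1234567890ab
+arn:aws:kms:us-west-2:111122223333:alias/ExampleAlias
+```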
+![Advanced option user defined](../images/lake_advanced_option.png)
+
+Review all the information and click the **CREATE ENVIRONMENT** button.
+
+![Create environment button](../images/lake_create_environment.png)
+
+The deployment takes a few minutes. Once it completes, the created environment appears in the **Environments** section as a card view (the environment in this example is named quickstart_demo).
+
+![Newly created available environment](../images/lake_available_environment.png)
+
+## Access environment from public internet
+
+The created environment is accessible through the console only. To change that, click the created environment and go to the *SETTINGS* tab.
+
+![Settings menu of created environment](../images/lake_settings_menu.png)
+
+In **SETTINGS**, select the **Internet connection** checkbox and provide, in CIDR format, the IP addresses from which you want to access your environment (for example, 192.168.2.0/24 specifies all IP addresses in the range 192.168.2.0 to 192.168.2.255).
+
+:::note
+Find more information on setting up an internet connection [here](https://docs.teradata.com/r/Teradata-VantageCloud-Lake/Getting-Started-First-Sign-On-by-Organization-Admin/Step-2-Setting-the-Environment-Connection-Type/Setting-Up-an-Internet-Connection).
+:::
+
+![IP whitelisting](../images/lake_ip_addresses.png)
+
+Click the **SAVE** button at the top right of the page to confirm the changes.
+
+Go back to the **Environments** section and check your environment card. It now has **Public internet** access.
+
+![Public internet card view](../images/lake_public_internet_cv.png)
+
+
+## Summary
+
+In this quick start we learned how to create an environment in VantageCloud Lake and allow it to be accessed from the public internet.
+
+## Further reading
+
+* [Teradata VantageCloud Lake documentation](https://docs.teradata.com/r/Teradata-VantageCloud-Lake/Getting-Started-First-Sign-On-by-Organization-Admin)
\ No newline at end of file
diff --git a/quickstarts/vantagecloud-lake/images/vantage-lake-demo-jupyter-google-cloud-vertex-ai/activenotebook.png b/quickstarts/vantagecloud-lake/images/vantage-lake-demo-jupyter-google-cloud-vertex-ai/activenotebook.png
new file mode 100644
index 0000000000..e327051b01
Binary files /dev/null and b/quickstarts/vantagecloud-lake/images/vantage-lake-demo-jupyter-google-cloud-vertex-ai/activenotebook.png differ
diff --git a/quickstarts/vantagecloud-lake/images/vantage-lake-demo-jupyter-google-cloud-vertex-ai/bucket.png b/quickstarts/vantagecloud-lake/images/vantage-lake-demo-jupyter-google-cloud-vertex-ai/bucket.png
new file mode 100644
index 0000000000..089aa12477
Binary files /dev/null and b/quickstarts/vantagecloud-lake/images/vantage-lake-demo-jupyter-google-cloud-vertex-ai/bucket.png differ
diff --git a/quickstarts/vantagecloud-lake/images/vantage-lake-demo-jupyter-google-cloud-vertex-ai/detailsenv.png b/quickstarts/vantagecloud-lake/images/vantage-lake-demo-jupyter-google-cloud-vertex-ai/detailsenv.png
new file mode 100644
index 0000000000..bb11a0739f
Binary files /dev/null and b/quickstarts/vantagecloud-lake/images/vantage-lake-demo-jupyter-google-cloud-vertex-ai/detailsenv.png differ
diff --git a/quickstarts/vantagecloud-lake/images/vantage-lake-demo-jupyter-google-cloud-vertex-ai/notebooklauncher.png b/quickstarts/vantagecloud-lake/images/vantage-lake-demo-jupyter-google-cloud-vertex-ai/notebooklauncher.png
new file mode 100644
index 0000000000..d41fc0ea32
Binary files /dev/null and
b/quickstarts/vantagecloud-lake/images/vantage-lake-demo-jupyter-google-cloud-vertex-ai/notebooklauncher.png differ diff --git a/quickstarts/vantagecloud-lake/images/vantage-lake-demo-jupyter-google-cloud-vertex-ai/openvars.png b/quickstarts/vantagecloud-lake/images/vantage-lake-demo-jupyter-google-cloud-vertex-ai/openvars.png new file mode 100644 index 0000000000..65429c2e8f Binary files /dev/null and b/quickstarts/vantagecloud-lake/images/vantage-lake-demo-jupyter-google-cloud-vertex-ai/openvars.png differ diff --git a/quickstarts/vantagecloud-lake/images/vantage-lake-demo-jupyter-google-cloud-vertex-ai/python3.png b/quickstarts/vantagecloud-lake/images/vantage-lake-demo-jupyter-google-cloud-vertex-ai/python3.png new file mode 100644 index 0000000000..a31bea4648 Binary files /dev/null and b/quickstarts/vantagecloud-lake/images/vantage-lake-demo-jupyter-google-cloud-vertex-ai/python3.png differ diff --git a/quickstarts/vantagecloud-lake/images/vantage-lake-demo-jupyter-google-cloud-vertex-ai/startupscript.png b/quickstarts/vantagecloud-lake/images/vantage-lake-demo-jupyter-google-cloud-vertex-ai/startupscript.png new file mode 100644 index 0000000000..be698c82cd Binary files /dev/null and b/quickstarts/vantagecloud-lake/images/vantage-lake-demo-jupyter-google-cloud-vertex-ai/startupscript.png differ diff --git a/quickstarts/vantagecloud-lake/images/vantage-lake-demo-jupyter-google-cloud-vertex-ai/upload.png b/quickstarts/vantagecloud-lake/images/vantage-lake-demo-jupyter-google-cloud-vertex-ai/upload.png new file mode 100644 index 0000000000..e8b2be6821 Binary files /dev/null and b/quickstarts/vantagecloud-lake/images/vantage-lake-demo-jupyter-google-cloud-vertex-ai/upload.png differ diff --git a/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-azure/azure-app-service-1.PNG b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-azure/azure-app-service-1.PNG new file mode 100644 index 0000000000..4df3440c88 Binary files /dev/null and b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-azure/azure-app-service-1.PNG differ diff --git a/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-azure/azure-app-service-2.PNG b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-azure/azure-app-service-2.PNG new file mode 100644 index 0000000000..5a11561807 Binary files /dev/null and b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-azure/azure-app-service-2.PNG differ diff --git a/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-azure/azure-app-service-3.PNG b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-azure/azure-app-service-3.PNG new file mode 100644 index 0000000000..d36bfd2b20 Binary files /dev/null and b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-azure/azure-app-service-3.PNG differ diff --git a/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-azure/azure-app-service-4.PNG b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-azure/azure-app-service-4.PNG new file mode 100644 index 0000000000..8bef170cf7 Binary files /dev/null and b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-azure/azure-app-service-4.PNG differ diff --git a/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-azure/azure-app-service-complete-resource-8.PNG b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-azure/azure-app-service-complete-resource-8.PNG new file mode 
100644 index 0000000000..ba1dff1d0e Binary files /dev/null and b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-azure/azure-app-service-complete-resource-8.PNG differ diff --git a/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-azure/azure-app-service-deployment-complete-5.PNG b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-azure/azure-app-service-deployment-complete-5.PNG new file mode 100644 index 0000000000..003497dd5c Binary files /dev/null and b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-azure/azure-app-service-deployment-complete-5.PNG differ diff --git a/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-azure/azure-app-service-ips-14.PNG b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-azure/azure-app-service-ips-14.PNG new file mode 100644 index 0000000000..4e2397292a Binary files /dev/null and b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-azure/azure-app-service-ips-14.PNG differ diff --git a/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-azure/azure-app-service-resource-6.PNG b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-azure/azure-app-service-resource-6.PNG new file mode 100644 index 0000000000..4d8caf0a98 Binary files /dev/null and b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-azure/azure-app-service-resource-6.PNG differ diff --git a/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-azure/azure-app-service-resource-8.PNG b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-azure/azure-app-service-resource-8.PNG new file mode 100644 index 0000000000..359e2442e2 Binary files /dev/null and b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-azure/azure-app-service-resource-8.PNG differ diff --git a/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-azure/azure-app-service-resource-config-7.PNG b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-azure/azure-app-service-resource-config-7.PNG new file mode 100644 index 0000000000..6fa03a15af Binary files /dev/null and b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-azure/azure-app-service-resource-config-7.PNG differ diff --git a/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-azure/azure-console-0.PNG b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-azure/azure-console-0.PNG new file mode 100644 index 0000000000..39c8954a0b Binary files /dev/null and b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-azure/azure-console-0.PNG differ diff --git a/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-azure/azure-jupyter-console-10.PNG b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-azure/azure-jupyter-console-10.PNG new file mode 100644 index 0000000000..1aa7e747d9 Binary files /dev/null and b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-azure/azure-jupyter-console-10.PNG differ diff --git a/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-azure/azure-jupyter-console-auth-9.PNG b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-azure/azure-jupyter-console-auth-9.PNG new file mode 100644 index 0000000000..26847838b5 Binary files /dev/null and b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-azure/azure-jupyter-console-auth-9.PNG differ 
diff --git a/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-azure/azure-jupyter-console-click-lake-demos-12.PNG b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-azure/azure-jupyter-console-click-lake-demos-12.PNG new file mode 100644 index 0000000000..44acac38ef Binary files /dev/null and b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-azure/azure-jupyter-console-click-lake-demos-12.PNG differ diff --git a/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-azure/azure-jupyter-console-clone-11.PNG b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-azure/azure-jupyter-console-clone-11.PNG new file mode 100644 index 0000000000..6f0be953f2 Binary files /dev/null and b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-azure/azure-jupyter-console-clone-11.PNG differ diff --git a/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-azure/azure-jupyter-console-lakedemos-13.PNG b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-azure/azure-jupyter-console-lakedemos-13.PNG new file mode 100644 index 0000000000..299e337df1 Binary files /dev/null and b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-azure/azure-jupyter-console-lakedemos-13.PNG differ diff --git a/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-docker/lake_0_setup.png b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-docker/lake_0_setup.png new file mode 100644 index 0000000000..ca61e10cc4 Binary files /dev/null and b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-docker/lake_0_setup.png differ diff --git a/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-docker/lake_docker_url.png b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-docker/lake_docker_url.png new file mode 100644 index 0000000000..1ba028504a Binary files /dev/null and b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-docker/lake_docker_url.png differ diff --git a/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-docker/lake_ip_addresses.png b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-docker/lake_ip_addresses.png new file mode 100644 index 0000000000..a018bc0f9f Binary files /dev/null and b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-docker/lake_ip_addresses.png differ diff --git a/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-docker/lake_jupyter_notebook.png b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-docker/lake_jupyter_notebook.png new file mode 100644 index 0000000000..1d038a5385 Binary files /dev/null and b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-docker/lake_jupyter_notebook.png differ diff --git a/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-docker/lake_overview_page.png b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-docker/lake_overview_page.png new file mode 100644 index 0000000000..1c44887974 Binary files /dev/null and b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-docker/lake_overview_page.png differ diff --git a/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-docker/lake_public_internet_cv.png b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-docker/lake_public_internet_cv.png new file mode 100644 index 0000000000..03078aaa3c 
Binary files /dev/null and b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-docker/lake_public_internet_cv.png differ diff --git a/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-bucket-upload.png b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-bucket-upload.png new file mode 100644 index 0000000000..58f2603901 Binary files /dev/null and b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-bucket-upload.png differ diff --git a/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-config-1.PNG b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-config-1.PNG new file mode 100644 index 0000000000..d9b2da7024 Binary files /dev/null and b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-config-1.PNG differ diff --git a/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-config-2.PNG b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-config-2.PNG new file mode 100644 index 0000000000..b3d8f7d00c Binary files /dev/null and b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-config-2.PNG differ diff --git a/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-create-loaded-env.PNG b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-create-loaded-env.PNG new file mode 100644 index 0000000000..91f806e40a Binary files /dev/null and b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-create-loaded-env.PNG differ diff --git a/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-create-notebook-1.PNG b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-create-notebook-1.PNG new file mode 100644 index 0000000000..c1356ef3ee Binary files /dev/null and b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-create-notebook-1.PNG differ diff --git a/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-create-notebook-2.PNG b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-create-notebook-2.PNG new file mode 100644 index 0000000000..c5ad1bd65d Binary files /dev/null and b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-create-notebook-2.PNG differ diff --git a/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-create-notebook-3.PNG b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-create-notebook-3.PNG new file mode 100644 index 0000000000..4bc009f9bd Binary files /dev/null and b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-create-notebook-3.PNG differ diff --git a/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-create-notebook-4.PNG b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-create-notebook-4.PNG new file mode 100644 index 0000000000..cb90b8d06f Binary files /dev/null and b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-create-notebook-4.PNG differ diff --git 
a/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-iam-role-0.PNG b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-iam-role-0.PNG new file mode 100644 index 0000000000..bf2b220e46 Binary files /dev/null and b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-iam-role-0.PNG differ diff --git a/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-iam-role-1.PNG b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-iam-role-1.PNG new file mode 100644 index 0000000000..034b1d6067 Binary files /dev/null and b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-iam-role-1.PNG differ diff --git a/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-iam-role-2.PNG b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-iam-role-2.PNG new file mode 100644 index 0000000000..19c50e7291 Binary files /dev/null and b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-iam-role-2.PNG differ diff --git a/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-iam-role-3.PNG b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-iam-role-3.PNG new file mode 100644 index 0000000000..48cc073a59 Binary files /dev/null and b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-iam-role-3.PNG differ diff --git a/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-lake.PNG b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-lake.PNG new file mode 100644 index 0000000000..39f2bb6eba Binary files /dev/null and b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-lake.PNG differ diff --git a/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-list-ip.PNG b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-list-ip.PNG new file mode 100644 index 0000000000..ca6c36848e Binary files /dev/null and b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-list-ip.PNG differ diff --git a/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-vars.PNG b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-vars.PNG new file mode 100644 index 0000000000..76dc6c7103 Binary files /dev/null and b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-vars.PNG differ diff --git a/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demos-visual-studio-code/demoenvsetup.png b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demos-visual-studio-code/demoenvsetup.png new file mode 100644 index 0000000000..5b9f5389e6 Binary files /dev/null and b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demos-visual-studio-code/demoenvsetup.png differ diff --git a/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demos-visual-studio-code/existing.kernel.png b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demos-visual-studio-code/existing.kernel.png new file mode 100644 index 0000000000..87e2e88d5e Binary files /dev/null and 
b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demos-visual-studio-code/existing.kernel.png differ diff --git a/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demos-visual-studio-code/python.kernel.png b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demos-visual-studio-code/python.kernel.png new file mode 100644 index 0000000000..0ba0fbbced Binary files /dev/null and b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demos-visual-studio-code/python.kernel.png differ diff --git a/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demos-visual-studio-code/replace.png b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demos-visual-studio-code/replace.png new file mode 100644 index 0000000000..ae13634397 Binary files /dev/null and b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demos-visual-studio-code/replace.png differ diff --git a/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demos-visual-studio-code/search.png b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demos-visual-studio-code/search.png new file mode 100644 index 0000000000..3a946c2414 Binary files /dev/null and b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demos-visual-studio-code/search.png differ diff --git a/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demos-visual-studio-code/select.kernel.png b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demos-visual-studio-code/select.kernel.png new file mode 100644 index 0000000000..fc8088b109 Binary files /dev/null and b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demos-visual-studio-code/select.kernel.png differ diff --git a/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demos-visual-studio-code/select.results.png b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demos-visual-studio-code/select.results.png new file mode 100644 index 0000000000..5ff0f624ea Binary files /dev/null and b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demos-visual-studio-code/select.results.png differ diff --git a/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demos-visual-studio-code/server.display.name.png b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demos-visual-studio-code/server.display.name.png new file mode 100644 index 0000000000..c279a4e095 Binary files /dev/null and b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demos-visual-studio-code/server.display.name.png differ diff --git a/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demos-visual-studio-code/server.password.png b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demos-visual-studio-code/server.password.png new file mode 100644 index 0000000000..b0f550888a Binary files /dev/null and b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demos-visual-studio-code/server.password.png differ diff --git a/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demos-visual-studio-code/server.url.png b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demos-visual-studio-code/server.url.png new file mode 100644 index 0000000000..19a1b52f82 Binary files /dev/null and b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demos-visual-studio-code/server.url.png differ diff --git a/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demos-visual-studio-code/terminal.png b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demos-visual-studio-code/terminal.png new file mode 100644 index 0000000000..52b01ca02d Binary files /dev/null and 
b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demos-visual-studio-code/terminal.png differ
diff --git a/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demos-visual-studio-code/vscode.png b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demos-visual-studio-code/vscode.png
new file mode 100644
index 0000000000..56dff8d9b9
Binary files /dev/null and b/quickstarts/vantagecloud-lake/images/vantagecloud-lake-demos-visual-studio-code/vscode.png differ
diff --git a/quickstarts/vantagecloud-lake/vantagecloud-lake-demo-jupyter-azure.md b/quickstarts/vantagecloud-lake/vantagecloud-lake-demo-jupyter-azure.md
new file mode 100644
index 0000000000..d4db6e950a
--- /dev/null
+++ b/quickstarts/vantagecloud-lake/vantagecloud-lake-demo-jupyter-azure.md
@@ -0,0 +1,188 @@
+---
+sidebar_position: 6
+author: Daniel Herrera
+email: daniel.herrera2@teradata.com
+page_last_update: January 26th, 2024
+description: Run Teradata Jupyter Notebook Demos for VantageCloud Lake in Azure
+keywords: [data warehouses, compute storage separation, teradata, vantage, cloud data platform, business intelligence, enterprise analytics, jupyter, teradatasql, ipython-sql, cloud computing, machine learning, sagemaker, vantagecloud, vantagecloud lake, lake]
+---
+
+# Run Teradata Jupyter Notebook Demos for VantageCloud Lake in Microsoft Azure
+
+
+## Overview
+This quickstart details the process for running the [Teradata Jupyter Notebook Demos for VantageCloud Lake](https://github.com/Teradata/lake-demos) on Microsoft Azure.
+
+## Prerequisites
+* Access to a Microsoft Azure account
+* Access to a VantageCloud Lake environment
+
+import VantageCloudLakeRequest from './_partials/vantagecloud-lake-request.md';
+
+<VantageCloudLakeRequest />
+
+## Microsoft Azure setup
+In this section we cover each of the following steps in detail:
+
+* Create a Microsoft Azure Web App based on the Teradata Jupyter Lab extensions Docker image
+* Configure the Jupyter Lab extensions Azure Web App
+* Load the VantageCloud Lake demos into the Jupyter Lab extensions Azure Web App
+* Find the IP of the Jupyter Lab extensions Azure Web App
+
+### Create a Microsoft Azure Web App based on Teradata Jupyter Lab extensions Docker image
+* Log in to Microsoft Azure and click "App Services"
+
+![Azure console](./images/vantagecloud-lake-demo-jupyter-azure/azure-console-0.PNG)
+
+* In "App Services" click Web App
+
+![Create Azure web app](./images/vantagecloud-lake-demo-jupyter-azure/azure-app-service-1.PNG)
+
+* On the "Basics" tab:
+  * Select the appropriate resource group from the dropdown, or create a new one
+  * Enter a name for your web app
+  * Select "Docker Container" in the "Publish" radio button options
+  * Select "Linux" as the operating system
+  * Select the appropriate region from the dropdown
+  * Select the appropriate App Service plan.
If you don't have one, a new one will be created with the default configuration.
+** After completing this tab, click the "Docker" tab to continue.
+:::tip
+For the purposes of the VantageCloud Lake demo, redundancy is not needed.
+:::
+
+![Create Azure web app Basics](./images/vantagecloud-lake-demo-jupyter-azure/azure-app-service-2.PNG)
+
+* On the "Docker" tab:
+  * Select "Single Container" from the dropdown
+  * In the "Image Source" dropdown select "Docker Hub"
+  * In the "Access Type" dropdown select "Public"
+  * In "Image and tag" type `teradata/jupyterlab-extensions:latest`
+  * Select the "Review + Create" tab to continue
+
+:::tip
+A startup command is not needed for this App Service.
+:::
+
+![Create Azure web app Docker](./images/vantagecloud-lake-demo-jupyter-azure/azure-app-service-3.PNG)
+
+* In the "Review + Create" tab, click the "Create" button
+
+![Create Azure web app Review](./images/vantagecloud-lake-demo-jupyter-azure/azure-app-service-4.PNG)
+
+* When the deployment is complete, click the "Go to Resource" button
+
+![Create Azure web app Complete](./images/vantagecloud-lake-demo-jupyter-azure/azure-app-service-deployment-complete-5.PNG)
+
+### Configure Jupyter Lab extensions Azure Web App
+* Select Configuration on the right panel
+
+![Create Azure web app Complete](./images/vantagecloud-lake-demo-jupyter-azure/azure-app-service-resource-6.PNG)
+
+* Add the following application settings:
+
+| **Application Setting** | **Value** |
+|--------------|-----------|
+| *"accept_license"* | Y |
+| *"WEBSITES_PORT"* | 8888 |
+| *"JUPYTER_TOKEN"* | Define the Jupyter Lab access token that you would like to use. |
+
+:::info
+If you don't include the "JUPYTER_TOKEN" configuration, the container will generate a new token and log it to the console. You will need to retrieve it from the application logs. If you include the "JUPYTER_TOKEN" configuration key but leave the value blank, the system will set the token as an empty string, resulting in an unprotected Jupyter Lab environment without any token security.
+:::
+
+* Click Save; your app will be restarted
+
+![Config Azure web app](./images/vantagecloud-lake-demo-jupyter-azure/azure-app-service-resource-config-7.PNG)
+
+* Return to the Overview tab on the right panel
+
+### Load VantageCloud Lake demos to Jupyter Lab extensions Azure Web App
+* Click on Default domain
+
+![Config Azure web app](./images/vantagecloud-lake-demo-jupyter-azure/azure-app-service-resource-8.PNG)
+
+* On the Jupyter Lab start dialog, enter the Jupyter token you defined and click Log in
+
+![Config Azure web app](./images/vantagecloud-lake-demo-jupyter-azure/azure-jupyter-console-auth-9.PNG)
+
+* On the Jupyter Lab console, click on the git icon
+
+![Config Azure web app](./images/vantagecloud-lake-demo-jupyter-azure/azure-jupyter-console-10.PNG)
+
+* Copy the following URI into the corresponding field:
+  * `https://github.com/Teradata/lake-demos.git`
+* Click Clone
+
+![Config Azure web app](./images/vantagecloud-lake-demo-jupyter-azure/azure-jupyter-console-clone-11.PNG)
+
+* On the Jupyter Lab console, click on the `lake-demos` folder
+
+![Config Azure web app](./images/vantagecloud-lake-demo-jupyter-azure/azure-jupyter-console-click-lake-demos-12.PNG)
+
+![Config Azure web app](./images/vantagecloud-lake-demo-jupyter-azure/azure-jupyter-console-lakedemos-13.PNG)
+
+### Find the IP of the Jupyter Lab extensions Azure Web App
+* In JupyterLab, open a notebook with the Teradata Python kernel and run the following command to find your notebook instance's IP address.
+
+``` python , id="lakedemos_azure_first_config", role="emits-gtm-events, content-editable"
+import requests
+def get_public_ip():
+    try:
+        response = requests.get('https://api.ipify.org')
+        return response.text
+    except requests.RequestException as e:
+        return "Error: " + str(e)
+my_public_ip = get_public_ip()
+print("My Public IP is:", my_public_ip)
+```
+
+* The next step is to whitelist this IP in your VantageCloud Lake environment to allow the connection.
+* This is sufficient for the purposes of this guide and the notebook demos. For production environments, a more robust networking setup might be needed.
+* Azure App Service also publishes the list of all possible IP addresses that the service might expose; you can find it on the Overview tab.
+
+![Loaded JupyterLab](./images/vantagecloud-lake-demo-jupyter-azure/azure-app-service-ips-14.PNG)
+
+## VantageCloud Lake Configuration
+* In the VantageCloud Lake environment, under settings, add the IP of your notebook instance
+:::tip
+A lake environment supports whitelisting multiple addresses.
+:::
+
+![Initiate JupyterLab](./images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-lake.PNG)
+
+## Jupyter Notebook Demos for VantageCloud Lake
+
+### Configurations
+* [vars.json](https://github.com/Teradata/lake-demos/blob/main/vars.json) should be edited to match the configuration of your VantageCloud Lake environment
+
+![Initiate JupyterLab](./images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-vars.PNG)
+
+* Specifically, the following values should be added:
+
+| **Variable** | **Value** |
+|--------------|-----------|
+| *"host"* | Public IP value from your VantageCloud Lake environment |
+| *"UES_URI"* | Open Analytics endpoint from your VantageCloud Lake environment |
+| *"dbc"* | The master password of your VantageCloud Lake environment |
+
+* In the sample vars.json, the passwords of all users are defaulted to "password"; this is just for illustration purposes. Change all of these password fields to strong passwords, secure them as necessary, and follow other password management best practices.
+
+:::info
+Remember to change all passwords in the vars.json file.
+:::
+
+## Run demos
+Open and execute all the cells in *0_Demo_Environment_Setup.ipynb* to set up your environment, followed by *1_Demo_Setup_Base_Data.ipynb* to load the base data required for the demos.
+
+To learn more about the demo notebooks, go to the [Teradata Lake demos](https://github.com/Teradata/lake-demos) page on GitHub.
+
+## Summary
+
+In this quick start we learned how to run Jupyter notebook demos for VantageCloud Lake in Microsoft Azure.
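+
+If you prefer to script the Azure setup instead of clicking through the portal, the steps above can be approximated with the Azure CLI. The sketch below is only an illustration, assuming the `az` CLI is installed and you are logged in; the resource group, plan, app, and token names are placeholders, and flag names can vary slightly between CLI versions.
+
+``` bash
+# Sketch only: approximate CLI equivalent of the portal steps above (all names are placeholders).
+az group create --name lake-demos-rg --location eastus
+az appservice plan create --name lake-demos-plan --resource-group lake-demos-rg --is-linux --sku B1
+
+# Create the web app from the Teradata Jupyter Lab extensions image on Docker Hub.
+az webapp create --name <your-app-name> --resource-group lake-demos-rg --plan lake-demos-plan \
+  --deployment-container-image-name teradata/jupyterlab-extensions:latest
+
+# Apply the same application settings listed in the table above.
+az webapp config appsettings set --name <your-app-name> --resource-group lake-demos-rg \
+  --settings accept_license=Y WEBSITES_PORT=8888 JUPYTER_TOKEN=<your-token>
+```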
+
+## Further reading
+
+* [Teradata VantageCloud Lake documentation](https://docs.teradata.com/r/Teradata-VantageCloud-Lake/Getting-Started-First-Sign-On-by-Organization-Admin)
+* [Use Vantage from a Jupyter notebook](https://quickstarts.teradata.com/jupyter.html)
\ No newline at end of file
diff --git a/quickstarts/vantagecloud-lake/vantagecloud-lake-demo-jupyter-docker.md b/quickstarts/vantagecloud-lake/vantagecloud-lake-demo-jupyter-docker.md
new file mode 100644
index 0000000000..5b9c02d922
--- /dev/null
+++ b/quickstarts/vantagecloud-lake/vantagecloud-lake-demo-jupyter-docker.md
@@ -0,0 +1,117 @@
+---
+sidebar_position: 2
+author: Vidhan Bhonsle
+email: vidhan.bhonsle@teradata.com
+page_last_update: January 10th, 2024
+description: Run Teradata Jupyter Notebook Demos for VantageCloud Lake in Docker
+keywords: [data warehouses, compute storage separation, teradata, vantage, cloud data platform, business intelligence, enterprise analytics, jupyter, teradatasql, ipython-sql, docker, container, vantagecloud, vantagecloud lake, lake]
+---
+
+import Tabs from '../_partials/tabsJupyterNotebook.mdx';
+
+# Run Teradata Jupyter Notebook Demos for VantageCloud Lake in Docker
+
+## Overview
+In this how-to we will go through the steps for connecting to Teradata VantageCloud Lake and running demos from a Jupyter notebook in Docker.
+
+## Prerequisites
+* [Docker Desktop](https://www.docker.com/products/docker-desktop) installed
+* [Git](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git) installed
+  * Required to download the git repo from https://github.com/Teradata/lake-demos.git
+* A Teradata VantageCloud Lake account login
+  * Organization URL and login details from the Teradata welcome letter
+* IDE of your choice
+
+## Create VantageCloud Lake environment
+Follow the instructions in the [VantageCloud Lake getting started](https://quickstarts.teradata.com/getting-started-with-vantagecloud-lake.html) guide to create your own environment.
+Once created, go to the **SETTINGS** tab and provide your public IP address to [access the environment](https://quickstarts.teradata.com/getting-started-with-vantagecloud-lake.html#_access_environment_from_public_internet).
+
+:::note
+You can find your IP address on the [WhatIsMyIp.com](https://www.whatismyip.com) website. Take note of the IPv4 address.
+:::
+
+![IP whitelisting](./images/vantagecloud-lake-demo-jupyter-docker/lake_ip_addresses.png)
+
+Your environment card should now show **Public internet** access.
+
+![Public internet card view](./images/vantagecloud-lake-demo-jupyter-docker/lake_public_internet_cv.png)
+
+From the **OVERVIEW** tab, copy:
+
+* Public IP
+* Open Analytics Endpoint
+
+These values are required to access VantageCloud Lake from Docker.
+ +![Environment Overview page](./images/vantagecloud-lake-demo-jupyter-docker/lake_overview_page.png) + +## Clone VantageCloud Lake Demo repository +Clone VantageCloud Lake Demo repository in your local machine: + +``` bash +git clone https://github.com/Teradata/lake-demos.git +cd lake-demos +``` + +The repository contains different files and folders, the important ones are: + +* Jupyter Notebooks + * [0_Demo_Environment_Setup.ipynb](https://github.com/Teradata/lake-demos/blob/main/0_Demo_Environment_Setup.ipynb) + * [1_Load_Base_Demo_Data.ipynb](https://github.com/Teradata/lake-demos/blob/main/1_Load_Base_Demo_Data.ipynb) + * [Data_Engineering_Exploration.ipynb](https://github.com/Teradata/lake-demos/blob/main/Data_Engineering_Exploration.ipynb) + * [Data_Science_OAF.ipynb](https://github.com/Teradata/lake-demos/blob/main/Data_Science_OAF.ipynb) + * [Demo_Admin.ipynb](https://github.com/Teradata/lake-demos/blob/main/Demo_Admin.ipynb) +* [vars.json file](https://github.com/Teradata/lake-demos/blob/main/vars.json) + +## Edit vars.json file +To connect Jupyter notebooks with VantageCloud Lake, you need to edit [vars.json file](https://github.com/Teradata/lake-demos/blob/main/vars.json) and provide: + + +| **Variable** | **Value** | +|--------------|-----------| +| *"host"* | Public IP value from *OVERVIEW* section (see above) | +| *"UES_URI"* | Open Analytics Endpoint value from *OVERVIEW* section (see above) | +| *"dbc"* | The master password of your VantageCloud Lake environment | + + +:::info +In the sample vars.json, the passwords of all users are defaulted to "password", this is just for illustration purposes. You should change all of these password fields to strong passwords, secure them as necessary, and follow other password management best practices. +::: + +## Mount files within Docker +To run VantageCloud Lake demos, we need the [Teradata Jupyter Extensions for Docker](https://hub.docker.com/r/teradata/jupyterlab-extensions). The extensions provide the SQL ipython kernel, utilities to manage connections to Teradata, and the database object explorer to make you productive while interacting with the Teradata database. + +:::info +Make sure that you are running all the commands in the same folder where you have cloned the demo repository. +::: + +Start a container and bind it to the existing lake-demos directory. Choose the appropriate command based on your operating system: + +:::note +For Windows, run the docker command in PowerShell. +::: + + + + +![docker logs](./images/vantagecloud-lake-demo-jupyter-docker/lake_docker_url.png) + +Click on the URL in docker logs to open Jupyter notebook in your browser. + +![Jupyter Notebook](./images/vantagecloud-lake-demo-jupyter-docker/lake_jupyter_notebook.png) + +## Run demos +Open and execute all the cells in *0_Demo_Environment_Setup.ipynb* to setup your environment, followed by **1_Demo_Setup_Base_Data.ipynb** to load the base data required for the demos. + + +![Environment setup Jupyter Notebook](./images/vantagecloud-lake-demo-jupyter-docker/lake_0_setup.png) + +To learn more about the demo notebooks, go to [Teradata Lake demos](https://github.com/Teradata/lake-demos) page on GitHub. + +## Summary + +In this quick start we learned how to run Teradata VantageCloud Lake demos from Jupyter Notebook in Docker. 
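+
+If you would rather drive the whole flow from a terminal, the sketch below strings the steps together on Linux/macOS. It assumes Git and Docker are installed; the `docker run` invocation is the same one used in the companion Visual Studio Code quickstart in this repository (on Windows, run the PowerShell variant with `${PWD}`).
+
+``` bash
+# Minimal end-to-end sketch (assumes Git and Docker are installed; Linux/macOS shell).
+git clone https://github.com/Teradata/lake-demos.git
+cd lake-demos
+
+# Start the Teradata Jupyter extensions container and bind the current directory,
+# then open the URL with the token printed in the container logs.
+docker run -e "accept_license=Y" -p 127.0.0.1:8888:8888 \
+  -v $PWD:/home/jovyan/JupyterLabRoot teradata/jupyterlab-extensions
+```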
+
+## Further reading
+
+* [Teradata VantageCloud Lake documentation](https://docs.teradata.com/r/Teradata-VantageCloud-Lake/Getting-Started-First-Sign-On-by-Organization-Admin)
+* [Use Vantage from a Jupyter notebook](https://quickstarts.teradata.com/jupyter.html)
diff --git a/quickstarts/vantagecloud-lake/vantagecloud-lake-demo-jupyter-google-cloud-vertex-ai.md b/quickstarts/vantagecloud-lake/vantagecloud-lake-demo-jupyter-google-cloud-vertex-ai.md
new file mode 100644
index 0000000000..3933d8789b
--- /dev/null
+++ b/quickstarts/vantagecloud-lake/vantagecloud-lake-demo-jupyter-google-cloud-vertex-ai.md
@@ -0,0 +1,152 @@
+---
+sidebar_position: 5
+author: Janeth Graziani
+email: Janeth.graziani@teradata.com
+page_last_update: January 24, 2024
+description: Run VantageCloud Lake Demos using Jupyter notebooks in Google Vertex AI Workbench.
+keywords: [data warehouses, compute storage separation, teradata, vantage, cloud data platform, business intelligence, enterprise analytics, jupyter, teradatasql, ipython-sql, teradatasqlalchemy, vantagecloud, vantagecloud lake, data analytics, data science, vertex ai, google cloud]
+---
+
+# Run Teradata Jupyter Notebook Demos for VantageCloud Lake in Google Cloud Vertex AI
+
+## Overview
+This quickstart explains how to run [Teradata Jupyter Notebook Demos for VantageCloud Lake](https://github.com/Teradata/lake-demos) on Vertex AI, the AI/ML platform for Google Cloud.
+
+## Prerequisites
+* Teradata modules for Jupyter **Linux desktop version** (download [here](https://downloads.teradata.com/download/tools/vantage-modules-for-jupyter), registration required)
+* Google Cloud account with [Vertex AI and Notebooks API](https://console.cloud.google.com/flows/enableapi?apiid=notebooks.googleapis.com,aiplatform.googleapis.com&redirect=https://console.cloud.google.com&_ga=2.180323111.284679914.1706204112-1996764819.1705688373) enabled
+* A Google Cloud Storage bucket to store startup scripts and the Teradata Jupyter extension package
+* [Access to a VantageCloud Lake environment](https://quickstarts.teradata.com/getting-started-with-vantagecloud-lake.html)
+
+## Vertex AI Google Cloud environment setup
+
+When you create a new notebook instance, you can specify a startup script. This script, which runs only once after instance creation, will install the Teradata Jupyter extension package and clone a GitHub repository into the new user-managed notebooks instance.
+
+* Download the Teradata Jupyter extensions package
+  - Visit the [Vantage Modules for Jupyter page](https://downloads.teradata.com/download/tools/vantage-modules-for-jupyter)
+  - Sign in and download the Teradata Linux version of the package.
+
+* Create a Google Cloud Storage bucket
+  - Create a bucket with a name relevant to the project (e.g., teradata_jupyter).
+  - Ensure that the bucket name is globally unique. For instance, if the name teradata_jupyter has already been used, it will not be available for subsequent users.
+
+![New bucket](./images/vantage-lake-demo-jupyter-google-cloud-vertex-ai/bucket.png)
+
+* Upload the unzipped Jupyter extension package to your Google Cloud Storage bucket as a file.
+
+* Write the following startup script and save it as `startup.sh` to your local machine.
+
+Below is an example script that retrieves the Teradata Jupyter extension package from the Google Cloud Storage bucket, installs the Teradata SQL kernel and extensions, and clones the lake-demos repository.
+
+:::info
+Remember to replace `teradata_jupyter` with your bucket name in the `gsutil cp` command.
+::: + +``` bash , id="vertex_ex_script", role="content-editable, emits-gtm-events" +#! /bin/bash + +cd /home/jupyter +mkdir teradata +cd teradata +gsutil cp gs://teradata_jupyter/* . +unzip teradatasql*.zip + +# Install Teradata kernel +cp teradatakernel /usr/local/bin + +jupyter kernelspec install ./teradatasql --prefix=/opt/conda + +# Install Teradata extensions +pip install --find-links . teradata_preferences_prebuilt +pip install --find-links . teradata_connection_manager_prebuilt +pip install --find-links . teradata_sqlhighlighter_prebuilt +pip install --find-links . teradata_resultset_renderer_prebuilt +pip install --find-links . teradata_database_explorer_prebuilt + +# PIP install the Teradata Python library +pip install teradataml==17.20.00.04 + +# Install Teradata R library (optional, uncomment this line only if you use an environment that supports R) +#Rscript -e "install.packages('tdplyr',repos=c('https://r-repo.teradata.com','https://cloud.r-project.org'))" + +# Clone the Teradata lake-demos repository +su - jupyter -c "git clone https://github.com/Teradata/lake-demos.git" +``` +* Upload this script to your Google Cloud storage bucket as a file + +![files uploaded to bucket](./images/vantage-lake-demo-jupyter-google-cloud-vertex-ai/upload.png) + +### Initiating a user managed notebook instance + +* Access Vertex AI Workbench + +- Return to Vertex AI Workbench in Google Cloud console. +- Create a new User-Managed Notebook via Advanced Options or directly at https://notebook.new/. + +* Under Details, name your notebook, select your region and select continue. + +![notebook env details](./images/vantage-lake-demo-jupyter-google-cloud-vertex-ai/detailsenv.png) + +* Under Environment select **Browse** to select your startup.sh script from your Google Cloud Bucket. + +![select startup script](./images/vantage-lake-demo-jupyter-google-cloud-vertex-ai/startupscript.png) + +* Select Create to initiate the notebook. It may take a few minutes for the notebook creation process to complete. When it is done, click on OPEN JUPYTERLAB. + +![active notebook](./images/vantage-lake-demo-jupyter-google-cloud-vertex-ai/activenotebook.png) + +:::info +You will have to whitelist this IP in your VantageCloud Lake environment to allow the connection. This solution is appropriate in a trial environment. For production environments, a configuration of VPCs, Subnets, and Security Groups might need to be configured and whitelisted. +::: + +* On JupyterLab open a notebook with a Python kernel and run the following command for finding your notebook instance IP address. + +![python3 kernel](./images/vantage-lake-demo-jupyter-google-cloud-vertex-ai/python3.png) + +``` python , role="content-editable" +import requests +def get_public_ip(): + try: + response = requests.get('https://api.ipify.org') + return response.text + except requests.RequestException as e: + return "Error: " + str(e) +my_public_ip = get_public_ip() +print("My Public IP is:", my_public_ip) +``` + +## VantageCloud Lake Configuration +* In the VantageCloud Lake environment, under settings, add the IP of your notebook instance + +![Initiate JupyterLab](./images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-lake.PNG) + +## Edit vars.json +Navigate into the `lake-demos` directory in your notebook. + +![notebook launcher](./images/vantage-lake-demo-jupyter-google-cloud-vertex-ai/notebooklauncher.png) + +Right click on vars.json to open the file with editor. 
+
+![vars.json](./images/vantage-lake-demo-jupyter-google-cloud-vertex-ai/openvars.png)
+
+Edit the *[vars.json](https://github.com/Teradata/lake-demos/blob/main/vars.json)* file to include the required credentials to run the demos:
+
+| **Variable** | **Value** |
+|--------------|-----------|
+| *"host"* | Public IP value from your VantageCloud Lake environment |
+| *"UES_URI"* | Open Analytics endpoint from your VantageCloud Lake environment |
+| *"dbc"* | The master password of your VantageCloud Lake environment |
+
+To retrieve the Public IP address and Open Analytics Endpoint, follow these [instructions](https://quickstarts.teradata.com/vantagecloud-lake/vantagecloud-lake-demo-jupyter-docker.html#_create_vantagecloud_lake_environment).
+
+:::info
+Change the passwords in the vars.json file. In the sample vars.json, the passwords of all users are defaulted to "password"; this is just for the sample file. You should change all of these password fields to strong passwords, secure them as necessary, and follow other password management best practices.
+:::
+
+## Run demos
+Execute all the cells in *0_Demo_Environment_Setup.ipynb* to set up your environment, followed by *1_Demo_Setup_Base_Data.ipynb* to load the base data required for the demos.
+
+To learn more about the demo notebooks, go to the [Teradata Lake demos](https://github.com/Teradata/lake-demos) page on GitHub.
+
+## Summary
+In this quickstart guide, we configured Google Cloud Vertex AI Workbench Notebooks to run [Teradata Jupyter Notebook Demos for VantageCloud Lake](https://github.com/Teradata/lake-demos).
\ No newline at end of file
diff --git a/quickstarts/vantagecloud-lake/vantagecloud-lake-demo-jupyter-sagemaker.md b/quickstarts/vantagecloud-lake/vantagecloud-lake-demo-jupyter-sagemaker.md
new file mode 100644
index 0000000000..5cd7365e32
--- /dev/null
+++ b/quickstarts/vantagecloud-lake/vantagecloud-lake-demo-jupyter-sagemaker.md
@@ -0,0 +1,211 @@
+---
+sidebar_position: 4
+author: Daniel Herrera
+email: daniel.herrera2@teradata.com
+page_last_update: January 16th, 2024
+description: Run Teradata Jupyter Notebook Demos for VantageCloud Lake in SageMaker
+keywords: [data warehouses, compute storage separation, teradata, vantage, cloud data platform, business intelligence, enterprise analytics, jupyter, teradatasql, ipython-sql, cloud computing, machine learning, sagemaker, vantagecloud, vantagecloud lake, lake]
+---
+
+# Run Teradata Jupyter Notebook Demos for VantageCloud Lake in Amazon SageMaker
+
+## Overview
+This quickstart details the process for running the [Teradata Jupyter Notebook Demos for VantageCloud Lake](https://github.com/Teradata/lake-demos) on Amazon SageMaker, the AI/ML platform from AWS.
+
+## Prerequisites
+* Teradata modules for Jupyter (download [here](https://downloads.teradata.com/download/tools/vantage-modules-for-jupyter), registration required)
+* AWS account with access to S3 and SageMaker
+* [Access to a VantageCloud Lake environment](https://quickstarts.teradata.com/getting-started-with-vantagecloud-lake.html)
+
+## AWS environment set-up
+In this section we will cover in detail each of the steps below:
+
+* Upload the Teradata modules for Jupyter to an S3 bucket
+* Create an IAM role for your Jupyter notebook instance
+* Create a lifecycle configuration for your Jupyter notebook instance
+* Create a Jupyter notebook instance
+* Find the IP CIDR of your Jupyter notebook instance
+
+### Upload the Teradata modules for Jupyter to an S3 bucket
+* On AWS S3, create a bucket and make a note of the assigned name
+* The default options are appropriate for this bucket
+* Upload the Teradata modules for Jupyter to the created bucket
+
+![Load modules in S3 bucket](./images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-bucket-upload.png)
+
+### Create an IAM role for your Jupyter Notebooks instance
+* On SageMaker, navigate to the role manager
+
+![New role creation](./images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-iam-role-0.PNG)
+* Create a new role (if not already defined)
+* For the purposes of this guide, the created role is assigned the data scientist persona
+
+![Role name and persona](./images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-iam-role-1.PNG)
+* In the settings, it is appropriate to keep the defaults
+* In the corresponding screen, define the bucket where you uploaded the Teradata Jupyter modules
+
+![S3 bucket](./images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-iam-role-2.PNG)
+* In the next configuration, add the corresponding policies for access to the S3 bucket
+
+![S3 bucket permissions](./images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-iam-role-3.PNG)
+
+### Create a lifecycle configuration for your Jupyter Notebooks instance
+* On SageMaker, navigate to lifecycle configurations and click on create
+
+![Create lifecycle configuration](./images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-config-1.PNG)
+* Define a lifecycle configuration with the following scripts
+  * When working from a Windows environment, we recommend copying the scripts into the lifecycle configuration editor line by line. Press 'Enter' after each line directly in the editor to avoid copying issues. This approach helps prevent carriage return errors that can occur due to encoding differences between Windows and Linux. Such errors often manifest as "/bin/bash^M: bad interpreter" and can disrupt script execution.
+
+![Create lifecycle configuration](./images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-config-2.PNG)
+
+  * On create script:
+
+``` bash , id="sagemaker_first_config", role="content-editable emits-gtm-events"
+#!/bin/bash
+
+set -e
+
+# This script installs a custom, persistent installation of conda on the Notebook Instance's EBS volume, and ensures
+# that these custom environments are available as kernels in Jupyter.
+ + +sudo -u ec2-user -i <<'EOF' +unset SUDO_UID +# Install a separate conda installation via Miniconda +WORKING_DIR=/home/ec2-user/SageMaker/custom-miniconda +mkdir -p "$WORKING_DIR" +wget https://repo.anaconda.com/miniconda/Miniconda3-4.6.14-Linux-x86_64.sh -O "$WORKING_DIR/miniconda.sh" +bash "$WORKING_DIR/miniconda.sh" -b -u -p "$WORKING_DIR/miniconda" +rm -rf "$WORKING_DIR/miniconda.sh" +# Create a custom conda environment +source "$WORKING_DIR/miniconda/bin/activate" +KERNEL_NAME="teradatasql" + +PYTHON="3.8" +conda create --yes --name "$KERNEL_NAME" python="$PYTHON" +conda activate "$KERNEL_NAME" +pip install --quiet ipykernel + +EOF +``` + + * On start script (In this script substitute name of your bucket and confirm version of Jupyter modules) + +``` bash , role="content-editable emits-gtm-events" +#!/bin/bash + +set -e + +# This script installs Teradata Jupyter kernel and extensions. + + +sudo -u ec2-user -i <<'EOF' +unset SUDO_UID + +WORKING_DIR=/home/ec2-user/SageMaker/custom-miniconda + +source "$WORKING_DIR/miniconda/bin/activate" teradatasql + +# Install teradatasql, teradataml, and pandas in the teradatasql environment +pip install teradataml +pip install pandas + +# fetch Teradata Jupyter extensions package from S3 and unzip it +mkdir -p "$WORKING_DIR/teradata" +aws s3 cp s3://resources-jp-extensions/teradatasqllinux_3.4.1-d05242023.zip "$WORKING_DIR/teradata" +cd "$WORKING_DIR/teradata" +unzip -o teradatasqllinux_3.4.1-d05242023 +cp teradatakernel /home/ec2-user/anaconda3/condabin +jupyter kernelspec install --user ./teradatasql +source /home/ec2-user/anaconda3/bin/activate JupyterSystemEnv + +# Install other Teradata-related packages +pip install teradata_connection_manager_prebuilt-3.4.1.tar.gz +pip install teradata_database_explorer_prebuilt-3.4.1.tar.gz +pip install teradata_preferences_prebuilt-3.4.1.tar.gz +pip install teradata_resultset_renderer_prebuilt-3.4.1.tar.gz +pip install teradata_sqlhighlighter_prebuilt-3.4.1.tar.gz + +conda deactivate +EOF +``` + +### Create Jupyter Notebooks instance +* On SageMaker navigate Notebooks, Notebook instances, create notebook instance +* Choose a name for your notebook instance, define size (for demos the smaller available instance is enough) +* Click in additional configurations and assign the recently created lifecycle configuration + +![Create notebook instance](./images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-create-notebook-1.PNG) +* Click in additional configurations and assign the recently created lifecycle configuration +* Assign the recently created IAM role to the notebook instance + +![Assign IAM role to notebook instance](./images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-create-notebook-2.PNG) + +* Paste the following link https://github.com/Teradata/lake-demos as the default github repository for the notebook instance + +![Assign default repository for the notebook instance](./images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-create-notebook-3.PNG) + +## Find the IP CIDR of your Jupyter Notebooks instance +* Once the instance is running click on open JupyterLab + +![Initiate JupyterLab](./images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-create-notebook-4.PNG) + +![Loaded JupyterLab](./images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-create-loaded-env.PNG) + +* On JupyterLab open a notebook with Teradata Python kernel and run the following command for finding your notebook instance IP address. 
+* We will whitelist this IP in your VantageCloud Lake environment to allow the connection.
+* This is sufficient for the purposes of this guide and the notebook demos. For production environments, a configuration of VPCs, subnets, and security groups might need to be set up and whitelisted.
+
+``` python , role="content-editable"
+import requests
+def get_public_ip():
+    try:
+        response = requests.get('https://api.ipify.org')
+        return response.text
+    except requests.RequestException as e:
+        return "Error: " + str(e)
+my_public_ip = get_public_ip()
+print("My Public IP is:", my_public_ip)
+```
+
+## VantageCloud Lake Configuration
+* In the VantageCloud Lake environment, under settings, add the IP of your notebook instance
+
+![Initiate JupyterLab](./images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-lake.PNG)
+
+## Jupyter Notebook Demos for VantageCloud Lake
+
+### Configurations
+* The [vars.json](https://github.com/Teradata/lake-demos/blob/main/vars.json) file should be edited to match the configuration of your VantageCloud Lake environment
+
+![Initiate JupyterLab](./images/vantagecloud-lake-demo-jupyter-sagemaker/sagemaker-vars.PNG)
+
+* Specifically, the following values should be added:
+
+| **Variable** | **Value** |
+|--------------|-----------|
+| *"host"* | Public IP value from your VantageCloud Lake environment |
+| *"UES_URI"* | Open Analytics endpoint from your VantageCloud Lake environment |
+| *"dbc"* | The master password of your VantageCloud Lake environment |
+
+:::info
+Remember to change all passwords in the vars.json file.
+:::
+
+* In the sample vars.json, the passwords of all users are defaulted to "password"; this is just for illustration purposes. Change all of these password fields to strong passwords, secure them as necessary, and follow other password management best practices.
+
+## Run demos
+Open and execute all the cells in *0_Demo_Environment_Setup.ipynb* to set up your environment, followed by *1_Demo_Setup_Base_Data.ipynb* to load the base data required for the demos.
+
+To learn more about the demo notebooks, go to the [Teradata Lake demos](https://github.com/Teradata/lake-demos) page on GitHub.
+
+## Summary
+
+In this quick start we learned how to run Jupyter notebook demos for VantageCloud Lake in Amazon SageMaker.
+
+## Further reading
+
+* [Teradata VantageCloud Lake documentation](https://docs.teradata.com/r/Teradata-VantageCloud-Lake/Getting-Started-First-Sign-On-by-Organization-Admin)
+* [Use Vantage from a Jupyter notebook](https://quickstarts.teradata.com/jupyter.html)
\ No newline at end of file
diff --git a/quickstarts/vantagecloud-lake/vantagecloud-lake-demos-visual-studio-code.md b/quickstarts/vantagecloud-lake/vantagecloud-lake-demos-visual-studio-code.md
new file mode 100644
index 0000000000..d98a13868e
--- /dev/null
+++ b/quickstarts/vantagecloud-lake/vantagecloud-lake-demos-visual-studio-code.md
@@ -0,0 +1,156 @@
+---
+sidebar_position: 3
+author: Janeth Graziani
+email: Janeth.graziani@teradata.com
+page_last_update: January 11, 2024
+description: Learn how to run VantageCloud Lake Demos using Jupyter notebooks in Visual Studio Code.
+keywords: [data warehouses, compute storage separation, teradata, vantage, cloud data platform, business intelligence, enterprise analytics, jupyter, teradatasql, ipython-sql, teradatasqlalchemy, vantagecloud, vantagecloud lake, public internet, visual studio code, IDE, data analytics, data science]
+---
+
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
+# Run Teradata Jupyter Notebook Demos for VantageCloud Lake in Visual Studio Code
+
+## Overview
+Visual Studio Code is a popular open-source code editor compatible with Windows, macOS, and Linux. Developers use this integrated development environment (IDE) for coding, debugging, building, and deploying applications. In this quickstart guide, we launch VantageCloud Lake Jupyter notebook demos within Visual Studio Code.
+
+![vscode.png](./images/vantagecloud-lake-demos-visual-studio-code/vscode.png)
+
+## Prerequisites
+Before you begin, ensure you have the following prerequisites in place:
+
+* [Docker Desktop](https://www.docker.com/products/docker-desktop) installed
+* [Git](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git) installed
+  * Required to download the git repo from https://github.com/Teradata/lake-demos.git
+* [Visual Studio Code](https://code.visualstudio.com/download) installed
+* A Teradata VantageCloud Lake account with organization URL and login details from the Teradata welcome letter
+  * Once logged in, follow these [instructions](https://quickstarts.teradata.com/getting-started-with-vantagecloud-lake.html#_create_an_environment) to create a VantageCloud Lake environment
+
+## Clone VantageCloud Lake Demo repository
+Begin by cloning the GitHub repository and navigating to the project directory:
+
+``` bash
+git clone https://github.com/Teradata/lake-demos.git
+cd lake-demos
+```
+
+## Start a JupyterLab Docker container with Teradata Jupyter Extensions
+To launch Teradata VantageCloud Lake demos, we need the [Teradata Jupyter Extensions for Docker](https://hub.docker.com/r/teradata/jupyterlab-extensions). These extensions provide the SQL ipython kernel, utilities to manage connections to Teradata, and the database object explorer to make you productive while interacting with the Teradata database.
+
+Next, start a container and bind it to the existing lake-demos directory. Choose the appropriate command based on your operating system:
+
+:::note
+For Windows, run the docker command in PowerShell.
+:::
+
+```mdx-code-block
+<Tabs>
+  <TabItem value="windows" label="Windows">
+
+    ```bash
+    docker run -e "accept_license=Y" -p 127.0.0.1:8888:8888 -v ${PWD}:/home/jovyan/JupyterLabRoot teradata/jupyterlab-extensions
+    ```
+
+  </TabItem>
+  <TabItem value="macos" label="MacOS">
+
+    ```bash
+    docker run -e "accept_license=Y" -p 127.0.0.1:8888:8888 -v $PWD:/home/jovyan/JupyterLabRoot teradata/jupyterlab-extensions
+    ```
+
+  </TabItem>
+  <TabItem value="linux" label="Linux">
+
+    ```bash
+    docker run -e "accept_license=Y" -p 127.0.0.1:8888:8888 -v $PWD:/home/jovyan/JupyterLabRoot teradata/jupyterlab-extensions
+    ```
+
+  </TabItem>
+</Tabs>
+```
+
+Take note of the resulting URL and token; you’ll need them to establish the connection from Visual Studio Code.
+
+![terminal.png](./images/vantagecloud-lake-demos-visual-studio-code/terminal.png)
+
+## Visual Studio Code Configuration
+Open the `lake-demos` project directory in Visual Studio Code.
The repository contains the following project tree: + +LAKE_DEMOS + +* [UseCases](https://github.com/Teradata/lake-demos/tree/main/UseCases) + * [0_Demo_Environment_Setup.ipynb](https://github.com/Teradata/lake-demos/blob/main/0_Demo_Environment_Setup.ipynb) + * [1_Load_Base_Demo_Data.ipynb](https://github.com/Teradata/lake-demos/blob/main/1_Load_Base_Demo_Data.ipynb) + * [Data_Engineering_Exploration.ipynb](https://github.com/Teradata/lake-demos/blob/main/Data_Engineering_Exploration.ipynb) + * [Data_Science_OAF.ipynb](https://github.com/Teradata/lake-demos/blob/main/Data_Science_OAF.ipynb) + * [Demo_Admin.ipynb](https://github.com/Teradata/lake-demos/blob/main/Demo_Admin.ipynb) +* [vars.json file](https://github.com/Teradata/lake-demos/blob/main/vars.json) + +### Edit vars.json file +Edit the *[vars.json](https://github.com/Teradata/lake-demos/blob/main/vars.json)* file to include the required credentials to run the demos + + + + +| **Variable** | **Value** | +|--------------|-----------| +| *"host"* | Public IP value from your VantageCloud Lake environment | +| *"UES_URI"* | Open Analytics from your VantageCloud Lake environment | +| *"dbc"* | The master password of your VantageCloud Lake environment. | + + +To retrieve a Public IP address and Open Analytics Endpoint follow these [instructions](https://quickstarts.teradata.com/vantagecloud-lake/vantagecloud-lake-demo-jupyter-docker.html). + +:::info +Change passwords in the vars.json file. You'll see that in the sample vars.json, the passwords of all users are defaulted to "password", this is just for matters of the sample file, you should change all of these password fields to strong passwords, secure them as necessary and follow other password management best practices. +::: + +### Modify path to vars.json in UseCases directory + +In the UseCases directory, all .ipynb files use the path ../../vars.json to load the variables from the JSON file when working from Jupyterlab. To work directly from Visual Studio Code, update the code in each .ipynb to point to vars.json. + +The quickest way to make these changes is via search feature on the left vertical menu. Search for + +``` +'../../vars.json' +``` + +and replace with: + +``` +'vars.json' +``` + +![search](./images/vantagecloud-lake-demos-visual-studio-code/search.png) + +![replace](./images/vantagecloud-lake-demos-visual-studio-code/replace.png) + +### Configuring Jupyter Kernels +Open *0_Demo_Environment_Setup.ipynb* and click on Select Kernel at the top right corner of Visual Studio Code. + +If you have not installed Jupyter and Python extensions, Visual Studio Code will prompt you to install them. These extensions are necessary for Visual Studio Code to detect Kernels. To install them, select 'Install/Enable suggested extensions for Python and Jupyter.' + +![select.kernel.png](./images/vantagecloud-lake-demos-visual-studio-code/select.kernel.png) + +Once you've installed the necessary extensions, you'll find options in the drop-down menu. Choose **Existing Jupyter Kernel**. + +![existing.kernel.png](./images/vantagecloud-lake-demos-visual-studio-code/existing.kernel.png) + +Enter the URL of the running Jupyter Server and press enter. +``` +http://localhost:8888 +``` +![server.url.png](./images/vantagecloud-lake-demos-visual-studio-code/server.url.png) + +Enter the token found in your terminal when mounting files to the Docker container and press Enter. 
+ +![server.password.png](./images/vantagecloud-lake-demos-visual-studio-code/server.password.png) + +Change Server Display Name (Leave Blank To Use URL) + +![server.display.name.png](./images/vantagecloud-lake-demos-visual-studio-code/server.display.name.png) + +You now have access to all the Teradata Vantage extension kernels. Select Python 3 (ipykernel) from the running Jupyter server. + +![python.kernel.png](./images/vantagecloud-lake-demos-visual-studio-code/python.kernel.png) + +### Run demos +Execute all the cells in *0_Demo_Environment_Setup.ipynb* to setup your environment. Followed by *1_Demo_Setup_Base_Data.ipynb* to load the base data required for demo. +To learn more about the demo notebooks, go to [Teradata Lake demos](https://github.com/Teradata/lake-demos) page on GitHub. + +![demoenvsetup.png](./images/vantagecloud-lake-demos-visual-studio-code/demoenvsetup.png) + +## Summary +In this quickstart guide, we configured Visual Studio Code to access VantageCloud Lake demos using Jupyter notebooks. diff --git a/sidebars.js b/sidebars.js index 3327580322..1d54a4511c 100644 --- a/sidebars.js +++ b/sidebars.js @@ -30,4 +30,4 @@ const sidebars = { */ }; -export default sidebars; +export default sidebars; \ No newline at end of file diff --git a/src/components/QuickstartsPageCategories/index.js b/src/components/QuickstartsPageCategories/index.js new file mode 100644 index 0000000000..f659e9d131 --- /dev/null +++ b/src/components/QuickstartsPageCategories/index.js @@ -0,0 +1,75 @@ +import clsx from 'clsx'; +import Heading from '@theme/Heading'; +import styles from './styles.module.css'; +import Link from '@docusaurus/Link'; + +const FeatureTitle = 'Categories'; +const FeatureList = [ + { + title: `Introduction`, + description: 'Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.', + href: '/quickstarts/introduction/teradata-vantage-engine-architecture-and-concepts/', + }, + { + title: 'Get access to Vantage', + description: 'Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.', + href: '/quickstarts/get-access-to-vantage/on-your-local/getting-started-vmware/', + }, + { + title: 'Connect to Vantage', + description: 'Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.', + href: '/quickstarts/connect-to-vantage/install-teradata-studio-on-mac-m1-m2/', + }, + { + title: 'Manage data', + description: 'Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.', + href: '/quickstarts/manage-data/nos/', + }, + { + title: 'Connect applications', + description: 'Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.', + href: '/quickstarts/create-applications/jdbc/', + }, + { + title: 'Analyze data', + description: 'Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.', + href: '/quickstarts/analyze-data/jupyter/', + }, + { + title: 'Teradata AI Unlimited', + description: 'Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.', + href: '/ai-unlimited-docs/', + }, + { + title: 'VantageCloud Lake', + description: 'Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. 
', + href: '/quickstarts/vantagecloud-lake/getting-started-with-vantagecloud-lake/', + }, +]; + + +function Feature({ title, description, href }) { + return ( + +
+ {title} + {description &&

{description}

} +
+ + ); +} + +export default function HomepageFeatures() { + return ( +
+
+

{FeatureTitle}

+
+ {FeatureList.map((props, idx) => ( + + ))} +
+
+
+ ); +} diff --git a/src/components/QuickstartsPageCategories/styles.module.css b/src/components/QuickstartsPageCategories/styles.module.css new file mode 100644 index 0000000000..c972d93c19 --- /dev/null +++ b/src/components/QuickstartsPageCategories/styles.module.css @@ -0,0 +1,96 @@ +.features { + display: flex; + align-items: center; + padding: 2rem 0; + background: var(--ifm-color-white); + gap: 1.5rem; +} + +.features h2 { + color: hsl(205, 100%, 12%); + margin-bottom: 40px; + + /* Desktop/H2 */ + font-family: "Inter", sans-serif; + font-size: 42px; + font-style: normal; + font-weight: 300; + line-height: 52px; /* 123.81% */ + letter-spacing: -1.26px; +} + +a.col { + text-decoration: none; +} + +.col { + padding: 0 0.75rem 1.5rem; +} + +.card { + --ifm-spacing-horizontal: 0.25rem; + + border-radius: 12px; + border: 1px solid #CED3DA; + background: #FFF; + margin-bottom: 1rem; + padding: 1.5rem; +} + +.card h3 { + color: var(--Primary-Navy, #00233c); + /* Desktop/H3 */ + font-family: Inter; + font-size: 1.5rem; + font-style: normal; + font-weight: 600; + line-height: 34px; /* 141.667% */ + letter-spacing: -0.24px; + margin-bottom: 0; +} + +.featureSvg { + height: 200px; + width: 200px; +} + +.container { + padding: 0 1.5rem; +} +.row { + margin: 0 -0.625rem; +} + + +@media screen and (min-width: 768px) and (max-width: 1024px) { + .container { + padding: 0 2.5rem 0; + } + + .col { + --ifm-col-width: 50%; + } +} + +@media screen and (max-width: 1024px) { + .features h2 { + font-size: 26px; + line-height: 36px; /* 138.462% */ + letter-spacing: -0.78px; + } +} + +@media screen and (min-width: 1025px) { + .container { + padding: 0 6.625rem 0; + margin: 0 auto; + width: 100%; + max-width: 1440px; + } +} + +@media (min-width: 1440px) { + .container { + max-width: 1440px; + } +} diff --git a/src/pages/quickstarts.js b/src/pages/quickstarts.js new file mode 100644 index 0000000000..f0ac5baeaa --- /dev/null +++ b/src/pages/quickstarts.js @@ -0,0 +1,50 @@ +import clsx from 'clsx'; +import Link from '@docusaurus/Link'; +import useDocusaurusContext from '@docusaurus/useDocusaurusContext'; +import Layout from '@theme/Layout'; +import QuickstartsPageCategories from '@site/src/components/QuickstartsPageCategories'; +import HeroImageUrl from '@site/static/img/quickstarts-hero.png'; +import Translate from '@docusaurus/Translate'; +import Heading from '@theme/Heading'; +import styles from './index.module.css'; +function HomepageHeader() { + const { siteConfig } = useDocusaurusContext(); + return ( +
+
+
+ + Getting started + +

+ + Discover in-depth tutorials and guides about how to use Teradata Vantage from all members of the Teradata community, including employees, partners, customers. + +

+

+ Existing customer or partner? Explore courses at Teradata University. +

+
+
+ Two individuals collaborate at a desk with a computer, against an abstract orange and blue backdrop. +
+
+
+
+ ); +} + +export default function Home() { + const { siteConfig } = useDocusaurusContext(); + return ( + + +
+ +
+
+ ); +} diff --git a/static/img/quickstarts-hero.png b/static/img/quickstarts-hero.png new file mode 100644 index 0000000000..5ab82c423b Binary files /dev/null and b/static/img/quickstarts-hero.png differ