forked from SatadruMukherjee/Data-Preprocessing-Models
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathUnstructured Data processing with Snowflake.txt
89 lines (63 loc) · 3.21 KB
/
Unstructured Data processing with Snowflake.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
--drop database if exists
drop database if exists s3_to_snowflake;
--Database Creation
create database if not exists s3_to_snowflake;
--Use the database
use s3_to_snowflake;
--create the external stage
create or replace stage s3_to_snowflake.PUBLIC.Snow_stage url="{s3 location}"
credentials=(aws_key_id='{AWS Access Key}'
aws_secret_key='{AWS Secret Key}');
list @s3_to_snowflake.PUBLIC.Snow_stage;
CREATE OR REPLACE PROCEDURE count_no_of_pages_sp(file_name string)
RETURNS integer
LANGUAGE PYTHON
RUNTIME_VERSION = '3.8'
PACKAGES = ('snowflake-snowpark-python','PyPDF2')
HANDLER = 'main_fn'
AS
$$
from snowflake.snowpark.files import SnowflakeFile
import PyPDF2
def main_fn(session,file_name):
f=SnowflakeFile.open(file_name,'rb')
pdf_object=PyPDF2.PdfReader(f);
return len(pdf_object.pages)
$$;
call count_no_of_pages_sp(BUILD_SCOPED_FILE_URL( @s3_to_snowflake.PUBLIC.Snow_stage , 'crm_system.pdf' ));
--select BUILD_SCOPED_FILE_URL( @s3_to_snowflake.PUBLIC.Snow_stage , 'upgrades-white-paper-final.pdf' );
CREATE OR REPLACE function count_no_of_pages_udf(file_name string)
RETURNS integer
LANGUAGE PYTHON
RUNTIME_VERSION = '3.8'
PACKAGES = ('snowflake-snowpark-python','PyPDF2')
HANDLER = 'main_fn'
AS
$$
from snowflake.snowpark.files import SnowflakeFile
import PyPDF2
def main_fn(file_name):
f=SnowflakeFile.open(file_name,'rb')
pdf_object=PyPDF2.PdfReader(f);
return len(pdf_object.pages)
$$;
select count_no_of_pages_udf(BUILD_SCOPED_FILE_URL( @s3_to_snowflake.PUBLIC.Snow_stage , 'sample_pdf_1.pdf' ));
list @s3_to_snowflake.PUBLIC.Snow_stage;
SELECT "name",count_no_of_pages_udf(BUILD_SCOPED_FILE_URL( @s3_to_snowflake.PUBLIC.Snow_stage , file_name )) as pdf_page_count FROM
(select "name",split_part("name",'/',-1) as file_name from TABLE(RESULT_SCAN(LAST_QUERY_ID())));
--create the external stage with directory table enabled
create or replace stage s3_to_snowflake.PUBLIC.Snow_stage_directory_table_yt url="{s3 location}"
credentials=(aws_key_id='{AWS Access Key}'
aws_secret_key='{AWS Secret Key}')
Directory=(ENABLE=TRUE);
list @s3_to_snowflake.PUBLIC.Snow_stage_directory_table_yt;
select * from directory(@s3_to_snowflake.PUBLIC.Snow_stage_directory_table_yt);
SELECT RELATIVE_PATH,count_no_of_pages_udf(BUILD_SCOPED_FILE_URL( @s3_to_snowflake.PUBLIC.Snow_stage_directory_table_yt , RELATIVE_PATH )) as pdf_page_count FROM directory(@s3_to_snowflake.PUBLIC.Snow_stage_directory_table_yt);
alter stage s3_to_snowflake.PUBLIC.Snow_stage_directory_table_yt refresh;
--create the external stage with directory table enabled & automatic refresh
create or replace stage s3_to_snowflake.PUBLIC.Snow_stage_directory_table_yt_auto_refresh url="{s3 location}"
credentials=(aws_key_id='{AWS Access Key}'
aws_secret_key='{AWS Secret Key}')
Directory=(ENABLE=TRUE AUTO_REFRESH=TRUE);
desc stage s3_to_snowflake.PUBLIC.Snow_stage_directory_table_yt_auto_refresh;
SELECT RELATIVE_PATH,count_no_of_pages_udf(BUILD_SCOPED_FILE_URL( @s3_to_snowflake.PUBLIC.Snow_stage_directory_table_yt_auto_refresh , RELATIVE_PATH )) as pdf_page_count FROM directory(@s3_to_snowflake.PUBLIC.Snow_stage_directory_table_yt_auto_refresh);