-
Notifications
You must be signed in to change notification settings - Fork 82
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
import-io/improvement #171
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
PLUGIN_VERSION=1.0.1 | ||
PLUGIN_VERSION=1.0.2 | ||
PLUGIN_ID=import-io | ||
|
||
plugin: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
{ | ||
"acceptedPythonInterpreters": ["PYTHON27","PYTHON35","PYTHON36"], | ||
"forceConda": false, | ||
"installCorePackages": true, | ||
"installJupyterSupport": false | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,5 @@ | ||
import urllib, json | ||
import urllib | ||
import json | ||
from dataiku.customrecipe import * | ||
import importio_utils | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,12 @@ | ||
import requests, json | ||
import requests | ||
import json | ||
import pandas as pd | ||
from dataiku.connector import Connector | ||
import importio_utils | ||
import logging | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class ImportIOConnector(Connector): | ||
|
||
|
@@ -13,24 +18,23 @@ def __init__(self, config): | |
elif self.config['api_url'].startswith('https://extraction.import.io/'): | ||
self.api_version = 'extraction' | ||
else: | ||
raise Exception( | ||
'It looks like this URL is not an API URL. URLs to call the API (and get a json response) start with "https://api.import.io" .') | ||
print '[import.io connector] calling API...' | ||
raise Exception('It looks like this URL is not an API URL. URLs to call the API (and get a json response) start with "https://api.import.io" .') | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It seems like, from the above if/else, the URL can also start with another prefix? Might be confusing not to log that. |
||
logger.info('[import.io connector] calling API...') | ||
response = requests.get(self.config['api_url']) | ||
print '[import.io connector] got response' | ||
logger.info('[import.io connector] got response') | ||
try: | ||
self.json = response.json() | ||
except Exception as e: | ||
print e | ||
print 'response was:\n', response.text | ||
raise | ||
logger.error(e) | ||
logger.error('response was:{}'.format(response.text)) | ||
raise ValueError | ||
|
||
def get_read_schema(self): | ||
if self.api_version == 'api': | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. nit: couldn't api_version just be a None-like value? |
||
columns = importio_utils.convert_schema(self.json['outputProperties']) | ||
return {"columns":columns} | ||
return {"columns": columns} | ||
else: | ||
return None | ||
return None | ||
|
||
def generate_rows(self, dataset_schema=None, dataset_partitioning=None, partition_id=None, records_limit = -1): | ||
if self.api_version == 'api': | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why are we testing this in every function? Also, why would the API version be 'api'? |
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,13 @@ | ||
# coding: UTF8 | ||
import urlparse, requests, sys, dataiku, time | ||
import urlparse | ||
import requests | ||
import sys | ||
import dataiku | ||
import time | ||
from dataiku.customrecipe import * | ||
import logging | ||
|
||
logger = logging.getLogger(__name__) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Isn't basic config necessary? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. +1 to Joachim — we should share config (nothing fancy, just simple stuff) across plugins. |
||
|
||
# See http://api.docs.import.io/#DataTypes | ||
importIO_subfields = { | ||
|
@@ -24,9 +31,7 @@ def convert_type(importIO_type): | |
def convert_schema(import_io_schema): | ||
result = [] | ||
for col in import_io_schema: | ||
result.append({ | ||
'name':col['name'], | ||
'type':convert_type(col['type'])}) | ||
result.append({'name':col['name'],'type':convert_type(col['type'])}) | ||
for subfield in importIO_subfields[col['type']]: | ||
result.append({'name':col['name']+'/'+subfield, 'type':'string'}) | ||
return result | ||
|
@@ -54,21 +59,24 @@ def run(build_query): | |
response = requests.get(request_url) | ||
json = response.json() | ||
except Exception as e: | ||
print 'request to import.io failed' | ||
print e | ||
print 'response was:\n',response | ||
raise | ||
logger.error('request to import.io failed') | ||
logger.error(e) | ||
logger.error('response was: {}'.format(response)) | ||
raise ValueError | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Aren't we masking exceptions? Why not just throw the actual exception? (There's sometimes a reason to do this, but I don't know if that is the case here, as a number of things could be going wrong and it might help the user to know.) |
||
if 'error' in json: | ||
print "response: ", json | ||
logger.error("response: {}".format(json)) | ||
raise Exception(json['error']) | ||
for result_line in json['results']: | ||
if not output_schema: | ||
print "Setting schema" | ||
logger.error("Setting schema") | ||
input_schema_names = frozenset([e['name'] for e in input.read_schema()]) | ||
output_schema = input.read_schema() | ||
print(')))))))))') | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Debug leftover? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. +1 with Joachim — I don't know what is going on here. |
||
print(json) | ||
print(json.keys()) | ||
for col in convert_schema(json['outputProperties']): | ||
if col['name'] in input_schema_names: | ||
print "Warning: input col "+col['name']+" will be overwritten by output col with same name." | ||
logger.error("Warning: input col "+col['name']+" will be overwritten by output col with same name.") | ||
input_cols_to_drop.append(col['name']) | ||
else: | ||
output_schema.append(col) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment.
The reason will be displayed to describe this comment to others. Learn more.
Maybe more specifically type the Exception — not necessarily a custom one, just a more descriptive exception class.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Also, we should probably log the exception properly (I might be having a moment — does `raise` do that natively?).