From b03c6c3687c5efd4e4628bb3de8f18f6d9c5fd92 Mon Sep 17 00:00:00 2001 From: Nikolaos Veneti Date: Mon, 25 Nov 2019 11:28:04 +0100 Subject: [PATCH 01/78] add lastUpdated field in engagements schema --- tap_hubspot/schemas/engagements.json | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tap_hubspot/schemas/engagements.json b/tap_hubspot/schemas/engagements.json index 76607494..7cc93f7e 100644 --- a/tap_hubspot/schemas/engagements.json +++ b/tap_hubspot/schemas/engagements.json @@ -4,6 +4,9 @@ "engagement_id": { "type": "integer" }, + "lastUpdated": { + "type": ["null", "string"] + }, "engagement": { "type": "object", "properties": { From b8cfaa7bd59482df8313838d04c17342e1f622cc Mon Sep 17 00:00:00 2001 From: Nikolaos Veneti Date: Mon, 25 Nov 2019 20:15:52 +0100 Subject: [PATCH 02/78] add custom values in .gitignore --- .gitignore | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.gitignore b/.gitignore index 675377ad..110b8e55 100644 --- a/.gitignore +++ b/.gitignore @@ -98,3 +98,8 @@ config.json .autoenv.zsh *~ env-vars* +bq_config.json +catalog.json +Pipfile +stream.ndjson +settings.json \ No newline at end of file From 5212a6895e0493d22d8d4fedb0bc9e12b1229ffa Mon Sep 17 00:00:00 2001 From: Nikolaos Veneti Date: Mon, 25 Nov 2019 20:16:42 +0100 Subject: [PATCH 03/78] change schema to accept values with underscore for contacts_to_company --- tap_hubspot/schemas/contacts_by_company.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tap_hubspot/schemas/contacts_by_company.json b/tap_hubspot/schemas/contacts_by_company.json index dafd30b2..22e7ffbb 100644 --- a/tap_hubspot/schemas/contacts_by_company.json +++ b/tap_hubspot/schemas/contacts_by_company.json @@ -1,10 +1,10 @@ { "type": "object", "properties": { - "contact-id": { + "contact_id": { "type": ["integer"] }, - "company-id": { + "company_id": { "type": ["integer"] } }, From 0249ad6c5eea83fe993675ec56847bf287e8a3e7 Mon Sep 17 00:00:00 2001 From: Nikolaos Veneti Date: Mon, 25 Nov 2019 20:17:04 +0100 Subject: [PATCH 04/78] write contacts_to_company values with underscore --- tap_hubspot/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tap_hubspot/__init__.py b/tap_hubspot/__init__.py index 86d0c7fa..96a2e22a 100644 --- a/tap_hubspot/__init__.py +++ b/tap_hubspot/__init__.py @@ -396,8 +396,8 @@ def _sync_contacts_by_company(STATE, ctx, company_id): data = request(url, default_contacts_by_company_params).json() for row in data[path]: counter.increment() - record = {'company-id' : company_id, - 'contact-id' : row} + record = {'company_id' : company_id, + 'contact_id' : row} record = bumble_bee.transform(record, schema, mdata) singer.write_record("contacts_by_company", record, time_extracted=utils.now()) From d25eb163f44f0aba32b7e87e0ac2b703ed69124c Mon Sep 17 00:00:00 2001 From: Nikolaos Veneti Date: Mon, 25 Nov 2019 20:38:10 +0100 Subject: [PATCH 05/78] if contacts_by_company in ctx.selected_stream_ids --- tap_hubspot/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tap_hubspot/__init__.py b/tap_hubspot/__init__.py index 96a2e22a..fb6180fb 100644 --- a/tap_hubspot/__init__.py +++ b/tap_hubspot/__init__.py @@ -431,7 +431,7 @@ def sync_companies(STATE, ctx): max_bk_value = start if CONTACTS_BY_COMPANY in ctx.selected_stream_ids: contacts_by_company_schema = load_schema(CONTACTS_BY_COMPANY) - singer.write_schema("contacts_by_company", contacts_by_company_schema, ["company-id", "contact-id"]) + singer.write_schema("contacts_by_company", 
contacts_by_company_schema, ["company_id", "contact_id"]) with bumble_bee: for row in gen_request(STATE, 'companies', url, default_company_params, 'companies', 'has-more', ['offset'], ['offset']): From 3ceb026e528bc3cd69a9367846b72e59ebd7e5f5 Mon Sep 17 00:00:00 2001 From: Nikolaos Veneti Date: Tue, 26 Nov 2019 08:07:55 +0100 Subject: [PATCH 06/78] add Pipfile.lock in .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 110b8e55..b7f64394 100644 --- a/.gitignore +++ b/.gitignore @@ -101,5 +101,6 @@ env-vars* bq_config.json catalog.json Pipfile +Pipfile.lock stream.ndjson settings.json \ No newline at end of file From 61145a7557752f173c29b3a34406840b14d068ee Mon Sep 17 00:00:00 2001 From: Nikolaos Veneti Date: Thu, 28 Nov 2019 12:46:12 +0100 Subject: [PATCH 07/78] get companies by batch --- tap_hubspot/__init__.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/tap_hubspot/__init__.py b/tap_hubspot/__init__.py index fb6180fb..e6492f00 100644 --- a/tap_hubspot/__init__.py +++ b/tap_hubspot/__init__.py @@ -449,13 +449,17 @@ def sync_companies(STATE, ctx): if modified_time and modified_time >= max_bk_value: max_bk_value = modified_time - if not modified_time or modified_time >= start: - record = request(get_url("companies_detail", company_id=row['companyId'])).json() - record = bumble_bee.transform(record, schema, mdata) - singer.write_record("companies", record, catalog.get('stream_alias'), time_extracted=utils.now()) - if CONTACTS_BY_COMPANY in ctx.selected_stream_ids: - STATE = _sync_contacts_by_company(STATE, ctx, record['companyId']) + + + # if not modified_time or modified_time >= start: + + + record = row + record = bumble_bee.transform(record, schema, mdata) + singer.write_record("companies", record, catalog.get('stream_alias'), time_extracted=utils.now()) + # if CONTACTS_BY_COMPANY in ctx.selected_stream_ids: + # STATE = _sync_contacts_by_company(STATE, ctx, record['companyId']) # Don't bookmark past the start of this sync to account for updated records during the sync. new_bookmark = min(max_bk_value, current_sync_start) STATE = singer.write_bookmark(STATE, 'companies', bookmark_key, utils.strftime(new_bookmark)) From 435dbffb35f9d67e6e861245f2980ad252b8ef4e Mon Sep 17 00:00:00 2001 From: Nikolaos Veneti Date: Thu, 28 Nov 2019 13:18:55 +0100 Subject: [PATCH 08/78] add contacts by company --- tap_hubspot/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tap_hubspot/__init__.py b/tap_hubspot/__init__.py index e6492f00..7718ed68 100644 --- a/tap_hubspot/__init__.py +++ b/tap_hubspot/__init__.py @@ -458,8 +458,8 @@ def sync_companies(STATE, ctx): record = row record = bumble_bee.transform(record, schema, mdata) singer.write_record("companies", record, catalog.get('stream_alias'), time_extracted=utils.now()) - # if CONTACTS_BY_COMPANY in ctx.selected_stream_ids: - # STATE = _sync_contacts_by_company(STATE, ctx, record['companyId']) + if CONTACTS_BY_COMPANY in ctx.selected_stream_ids: + STATE = _sync_contacts_by_company(STATE, ctx, record['companyId']) # Don't bookmark past the start of this sync to account for updated records during the sync. 
new_bookmark = min(max_bk_value, current_sync_start) STATE = singer.write_bookmark(STATE, 'companies', bookmark_key, utils.strftime(new_bookmark)) From 5372cb9f73c297ec55a6b95c4f57347fdb730af1 Mon Sep 17 00:00:00 2001 From: Nikolaos Veneti Date: Thu, 28 Nov 2019 13:55:33 +0100 Subject: [PATCH 09/78] set count to 250 --- tap_hubspot/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tap_hubspot/__init__.py b/tap_hubspot/__init__.py index 7718ed68..a930b353 100644 --- a/tap_hubspot/__init__.py +++ b/tap_hubspot/__init__.py @@ -382,7 +382,7 @@ class ValidationPredFailed(Exception): def use_recent_companies_endpoint(response): return response["total"] < 10000 -default_contacts_by_company_params = {'count' : 100} +default_contacts_by_company_params = {'count' : 250} # NB> to do: support stream aliasing and field selection def _sync_contacts_by_company(STATE, ctx, company_id): From 2d419bf5fb6a94e892a6c0f93d7df6e79a5aa081 Mon Sep 17 00:00:00 2001 From: Nikolaos Veneti Date: Thu, 28 Nov 2019 14:40:40 +0100 Subject: [PATCH 10/78] request website, name and country for company --- tap_hubspot/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tap_hubspot/__init__.py b/tap_hubspot/__init__.py index a930b353..8c14ca8c 100644 --- a/tap_hubspot/__init__.py +++ b/tap_hubspot/__init__.py @@ -404,7 +404,7 @@ def _sync_contacts_by_company(STATE, ctx, company_id): return STATE default_company_params = { - 'limit': 250, 'properties': ["createdate", "hs_lastmodifieddate"] + 'limit': 250, 'properties': ["website", "name","country", "createdate", "hs_lastmodifieddate"] } def sync_companies(STATE, ctx): From 99de19132e0afef54b908ae3ee3d704739be1577 Mon Sep 17 00:00:00 2001 From: Nikolaos Veneti Date: Thu, 28 Nov 2019 15:03:30 +0100 Subject: [PATCH 11/78] support domain in batch get for companies --- tap_hubspot/__init__.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/tap_hubspot/__init__.py b/tap_hubspot/__init__.py index 8c14ca8c..881375db 100644 --- a/tap_hubspot/__init__.py +++ b/tap_hubspot/__init__.py @@ -404,7 +404,7 @@ def _sync_contacts_by_company(STATE, ctx, company_id): return STATE default_company_params = { - 'limit': 250, 'properties': ["website", "name","country", "createdate", "hs_lastmodifieddate"] + 'limit': 250, 'properties': ["website", "name","country", "domain", "createdate", "hs_lastmodifieddate"] } def sync_companies(STATE, ctx): @@ -449,14 +449,7 @@ def sync_companies(STATE, ctx): if modified_time and modified_time >= max_bk_value: max_bk_value = modified_time - - - # if not modified_time or modified_time >= start: - - - - record = row - record = bumble_bee.transform(record, schema, mdata) + record = bumble_bee.transform(row, schema, mdata) singer.write_record("companies", record, catalog.get('stream_alias'), time_extracted=utils.now()) if CONTACTS_BY_COMPANY in ctx.selected_stream_ids: STATE = _sync_contacts_by_company(STATE, ctx, record['companyId']) From 1cc06bb1ea20df98e8440ba1c2c7e34f8bc75afb Mon Sep 17 00:00:00 2001 From: "Patrick-Ranjit D. 
Madsen" Date: Mon, 6 Jan 2020 14:44:04 +0100 Subject: [PATCH 12/78] black commit --- tap_hubspot/__init__.py | 843 ++++++++++++++++++++++++++-------------- 1 file changed, 554 insertions(+), 289 deletions(-) diff --git a/tap_hubspot/__init__.py b/tap_hubspot/__init__.py index 881375db..09f8bb8e 100644 --- a/tap_hubspot/__init__.py +++ b/tap_hubspot/__init__.py @@ -15,28 +15,37 @@ import singer.metrics as metrics from singer import metadata from singer import utils -from singer import (transform, - UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING, - Transformer, _transform_datetime) +from singer import ( + transform, + UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING, + Transformer, + _transform_datetime, +) LOGGER = singer.get_logger() SESSION = requests.Session() + class InvalidAuthException(Exception): pass + class SourceUnavailableException(Exception): pass + class DependencyException(Exception): pass + class DataFields: - offset = 'offset' + offset = "offset" + class StateFields: - offset = 'offset' - this_stream = 'this_stream' + offset = "offset" + this_stream = "this_stream" + BASE_URL = "https://api.hubapi.com" @@ -49,7 +58,6 @@ class StateFields: "token_expires": None, "email_chunk_size": DEFAULT_CHUNK_SIZE, "subscription_chunk_size": DEFAULT_CHUNK_SIZE, - # in config.json "redirect_uri": None, "client_id": None, @@ -61,55 +69,55 @@ class StateFields: } ENDPOINTS = { - "contacts_properties": "/properties/v1/contacts/properties", - "contacts_all": "/contacts/v1/lists/all/contacts/all", - "contacts_recent": "/contacts/v1/lists/recently_updated/contacts/recent", - "contacts_detail": "/contacts/v1/contact/vids/batch/", - + "contacts_properties": "/properties/v1/contacts/properties", + "contacts_all": "/contacts/v1/lists/all/contacts/all", + "contacts_recent": "/contacts/v1/lists/recently_updated/contacts/recent", + "contacts_detail": "/contacts/v1/contact/vids/batch/", "companies_properties": "/companies/v2/properties", - "companies_all": "/companies/v2/companies/paged", - "companies_recent": "/companies/v2/companies/recent/modified", - "companies_detail": "/companies/v2/companies/{company_id}", - "contacts_by_company": "/companies/v2/companies/{company_id}/vids", - - "deals_properties": "/properties/v1/deals/properties", - "deals_all": "/deals/v1/deal/paged", - "deals_recent": "/deals/v1/deal/recent/modified", - "deals_detail": "/deals/v1/deal/{deal_id}", - - "deal_pipelines": "/deals/v1/pipelines", - - "campaigns_all": "/email/public/v1/campaigns/by-id", - "campaigns_detail": "/email/public/v1/campaigns/{campaign_id}", - - "engagements_all": "/engagements/v1/engagements/paged", - + "companies_all": "/companies/v2/companies/paged", + "companies_recent": "/companies/v2/companies/recent/modified", + "companies_detail": "/companies/v2/companies/{company_id}", + "contacts_by_company": "/companies/v2/companies/{company_id}/vids", + "deals_properties": "/properties/v1/deals/properties", + "deals_all": "/deals/v1/deal/paged", + "deals_recent": "/deals/v1/deal/recent/modified", + "deals_detail": "/deals/v1/deal/{deal_id}", + "deal_pipelines": "/deals/v1/pipelines", + "campaigns_all": "/email/public/v1/campaigns/by-id", + "campaigns_detail": "/email/public/v1/campaigns/{campaign_id}", + "engagements_all": "/engagements/v1/engagements/paged", "subscription_changes": "/email/public/v1/subscriptions/timeline", - "email_events": "/email/public/v1/events", - "contact_lists": "/contacts/v1/lists", - "forms": "/forms/v2/forms", - "workflows": "/automation/v3/workflows", - "owners": "/owners/v2/owners", + 
"email_events": "/email/public/v1/events", + "contact_lists": "/contacts/v1/lists", + "forms": "/forms/v2/forms", + "workflows": "/automation/v3/workflows", + "owners": "/owners/v2/owners", } + def get_start(state, tap_stream_id, bookmark_key): current_bookmark = singer.get_bookmark(state, tap_stream_id, bookmark_key) if current_bookmark is None: - return CONFIG['start_date'] + return CONFIG["start_date"] return current_bookmark + def get_current_sync_start(state, tap_stream_id): - current_sync_start_value = singer.get_bookmark(state, tap_stream_id, "current_sync_start") + current_sync_start_value = singer.get_bookmark( + state, tap_stream_id, "current_sync_start" + ) if current_sync_start_value is None: return current_sync_start_value return utils.strptime_to_utc(current_sync_start_value) + def write_current_sync_start(state, tap_stream_id, start): value = start if start is not None: value = utils.strftime(start) return singer.write_bookmark(state, tap_stream_id, "current_sync_start", value) + def clean_state(state): """ Clear deprecated keys out of state. """ for stream, bookmark_map in state.get("bookmarks", {}).items(): @@ -117,6 +125,7 @@ def clean_state(state): LOGGER.info("{} - Removing last_sync_duration from state.".format(stream)) state["bookmarks"][stream].pop("last_sync_duration", None) + def get_url(endpoint, **kwargs): if endpoint not in ENDPOINTS: raise ValueError("Invalid endpoint {}".format(endpoint)) @@ -129,8 +138,7 @@ def get_field_type_schema(field_type): return {"type": ["null", "boolean"]} elif field_type == "datetime": - return {"type": ["null", "string"], - "format": "date-time"} + return {"type": ["null", "string"], "format": "date-time"} elif field_type == "number": # A value like 'N/A' can be returned for this type, @@ -140,6 +148,7 @@ def get_field_type_schema(field_type): else: return {"type": ["null", "string"]} + def get_field_schema(field_type, extras=False): if extras: return { @@ -149,93 +158,107 @@ def get_field_schema(field_type, extras=False): "timestamp": get_field_type_schema("datetime"), "source": get_field_type_schema("string"), "sourceId": get_field_type_schema("string"), - } + }, } else: return { "type": "object", - "properties": { - "value": get_field_type_schema(field_type), - } + "properties": {"value": get_field_type_schema(field_type)}, } + def parse_custom_schema(entity_name, data): return { - field['name']: get_field_schema( - field['type'], entity_name != "contacts") + field["name"]: get_field_schema(field["type"], entity_name != "contacts") for field in data } def get_custom_schema(entity_name): - return parse_custom_schema(entity_name, request(get_url(entity_name + "_properties")).json()) + return parse_custom_schema( + entity_name, request(get_url(entity_name + "_properties")).json() + ) def get_abs_path(path): return os.path.join(os.path.dirname(os.path.realpath(__file__)), path) + def load_associated_company_schema(): associated_company_schema = load_schema("companies") - #pylint: disable=line-too-long - associated_company_schema['properties']['company-id'] = associated_company_schema['properties'].pop('companyId') - associated_company_schema['properties']['portal-id'] = associated_company_schema['properties'].pop('portalId') + # pylint: disable=line-too-long + associated_company_schema["properties"]["company-id"] = associated_company_schema[ + "properties" + ].pop("companyId") + associated_company_schema["properties"]["portal-id"] = associated_company_schema[ + "properties" + ].pop("portalId") return associated_company_schema + def 
load_schema(entity_name): - schema = utils.load_json(get_abs_path('schemas/{}.json'.format(entity_name))) + schema = utils.load_json(get_abs_path("schemas/{}.json".format(entity_name))) if entity_name in ["contacts", "companies", "deals"]: custom_schema = get_custom_schema(entity_name) - schema['properties']['properties'] = { + schema["properties"]["properties"] = { "type": "object", "properties": custom_schema, } if entity_name == "contacts": - schema['properties']['associated-company'] = load_associated_company_schema() + schema["properties"]["associated-company"] = load_associated_company_schema() return schema -#pylint: disable=invalid-name + +# pylint: disable=invalid-name def acquire_access_token_from_refresh_token(): payload = { "grant_type": "refresh_token", - "redirect_uri": CONFIG['redirect_uri'], - "refresh_token": CONFIG['refresh_token'], - "client_id": CONFIG['client_id'], - "client_secret": CONFIG['client_secret'], + "redirect_uri": CONFIG["redirect_uri"], + "refresh_token": CONFIG["refresh_token"], + "client_id": CONFIG["client_id"], + "client_secret": CONFIG["client_secret"], } - resp = requests.post(BASE_URL + "/oauth/v1/token", data=payload) if resp.status_code == 403: raise InvalidAuthException(resp.content) resp.raise_for_status() auth = resp.json() - CONFIG['access_token'] = auth['access_token'] - CONFIG['refresh_token'] = auth['refresh_token'] - CONFIG['token_expires'] = ( - datetime.datetime.utcnow() + - datetime.timedelta(seconds=auth['expires_in'] - 600)) - LOGGER.info("Token refreshed. Expires at %s", CONFIG['token_expires']) + CONFIG["access_token"] = auth["access_token"] + CONFIG["refresh_token"] = auth["refresh_token"] + CONFIG["token_expires"] = datetime.datetime.utcnow() + datetime.timedelta( + seconds=auth["expires_in"] - 600 + ) + LOGGER.info("Token refreshed. 
Expires at %s", CONFIG["token_expires"]) def giveup(exc): - return exc.response is not None \ - and 400 <= exc.response.status_code < 500 \ + return ( + exc.response is not None + and 400 <= exc.response.status_code < 500 and exc.response.status_code != 429 + ) + def on_giveup(details): - if len(details['args']) == 2: - url, params = details['args'] + if len(details["args"]) == 2: + url, params = details["args"] else: - url = details['args'] + url = details["args"] params = {} - raise Exception("Giving up on request after {} tries with url {} and params {}" \ - .format(details['tries'], url, params)) + raise Exception( + "Giving up on request after {} tries with url {} and params {}".format( + details["tries"], url, params + ) + ) + + +URL_SOURCE_RE = re.compile(BASE_URL + r"/(\w+)/") -URL_SOURCE_RE = re.compile(BASE_URL + r'/(\w+)/') def parse_source_from_url(url): match = URL_SOURCE_RE.match(url) @@ -244,30 +267,34 @@ def parse_source_from_url(url): return None -@backoff.on_exception(backoff.constant, - (requests.exceptions.RequestException, - requests.exceptions.HTTPError), - max_tries=5, - jitter=None, - giveup=giveup, - on_giveup=on_giveup, - interval=10) +@backoff.on_exception( + backoff.constant, + (requests.exceptions.RequestException, requests.exceptions.HTTPError), + max_tries=5, + jitter=None, + giveup=giveup, + on_giveup=on_giveup, + interval=10, +) def request(url, params=None): params = params or {} - hapikey = CONFIG['hapikey'] + hapikey = CONFIG["hapikey"] if hapikey is None: - if CONFIG['token_expires'] is None or CONFIG['token_expires'] < datetime.datetime.utcnow(): + if ( + CONFIG["token_expires"] is None + or CONFIG["token_expires"] < datetime.datetime.utcnow() + ): acquire_access_token_from_refresh_token() - headers = {'Authorization': 'Bearer {}'.format(CONFIG['access_token'])} + headers = {"Authorization": "Bearer {}".format(CONFIG["access_token"])} else: - params['hapikey'] = hapikey + params["hapikey"] = hapikey headers = {} - if 'user_agent' in CONFIG: - headers['User-Agent'] = CONFIG['user_agent'] + if "user_agent" in CONFIG: + headers["User-Agent"] = CONFIG["user_agent"] - req = requests.Request('GET', url, params=params, headers=headers).prepare() + req = requests.Request("GET", url, params=params, headers=headers).prepare() LOGGER.info("GET %s", req.url) with metrics.http_request_timer(parse_source_from_url(url)) as timer: resp = SESSION.send(req) @@ -278,6 +305,8 @@ def request(url, params=None): resp.raise_for_status() return resp + + # {"bookmarks" : {"contacts" : { "lastmodifieddate" : "2001-01-01" # "offset" : {"vidOffset": 1234 # "timeOffset": "3434434 }} @@ -286,8 +315,10 @@ def request(url, params=None): # } # } -#pylint: disable=line-too-long -def gen_request(STATE, tap_stream_id, url, params, path, more_key, offset_keys, offset_targets): +# pylint: disable=line-too-long +def gen_request( + STATE, tap_stream_id, url, params, path, more_key, offset_keys, offset_targets +): if len(offset_keys) != len(offset_targets): raise ValueError("Number of offset_keys must match number of offset_targets") @@ -321,45 +352,67 @@ def _sync_contact_vids(catalog, vids, schema, bumble_bee): if len(vids) == 0: return - data = request(get_url("contacts_detail"), params={'vid': vids, 'showListMemberships' : True, "formSubmissionMode" : "all"}).json() + data = request( + get_url("contacts_detail"), + params={"vid": vids, "showListMemberships": True, "formSubmissionMode": "all"}, + ).json() time_extracted = utils.now() - mdata = metadata.to_map(catalog.get('metadata')) + 
mdata = metadata.to_map(catalog.get("metadata")) for record in data.values(): record = bumble_bee.transform(record, schema, mdata) - singer.write_record("contacts", record, catalog.get('stream_alias'), time_extracted=time_extracted) + singer.write_record( + "contacts", + record, + catalog.get("stream_alias"), + time_extracted=time_extracted, + ) + default_contact_params = { - 'showListMemberships': True, - 'includeVersion': True, - 'count': 100, + "showListMemberships": True, + "includeVersion": True, + "count": 100, } + def sync_contacts(STATE, ctx): catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE)) - bookmark_key = 'versionTimestamp' + bookmark_key = "versionTimestamp" start = utils.strptime_with_tz(get_start(STATE, "contacts", bookmark_key)) LOGGER.info("sync_contacts from %s", start) max_bk_value = start schema = load_schema("contacts") - singer.write_schema("contacts", schema, ["vid"], [bookmark_key], catalog.get('stream_alias')) + singer.write_schema( + "contacts", schema, ["vid"], [bookmark_key], catalog.get("stream_alias") + ) url = get_url("contacts_all") vids = [] with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee: - for row in gen_request(STATE, 'contacts', url, default_contact_params, 'contacts', 'has-more', ['vid-offset'], ['vidOffset']): + for row in gen_request( + STATE, + "contacts", + url, + default_contact_params, + "contacts", + "has-more", + ["vid-offset"], + ["vidOffset"], + ): modified_time = None if bookmark_key in row: modified_time = utils.strptime_with_tz( - _transform_datetime( # pylint: disable=protected-access - row[bookmark_key], - UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING)) + _transform_datetime( # pylint: disable=protected-access + row[bookmark_key], UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING + ) + ) if not modified_time or modified_time >= start: - vids.append(row['vid']) + vids.append(row["vid"]) if modified_time and modified_time >= max_bk_value: max_bk_value = modified_time @@ -370,52 +423,70 @@ def sync_contacts(STATE, ctx): _sync_contact_vids(catalog, vids, schema, bumble_bee) - STATE = singer.write_bookmark(STATE, 'contacts', bookmark_key, utils.strftime(max_bk_value)) + STATE = singer.write_bookmark( + STATE, "contacts", bookmark_key, utils.strftime(max_bk_value) + ) singer.write_state(STATE) return STATE + class ValidationPredFailed(Exception): pass + # companies_recent only supports 10,000 results. 
If there are more than this, # we'll need to use the companies_all endpoint def use_recent_companies_endpoint(response): return response["total"] < 10000 -default_contacts_by_company_params = {'count' : 250} + +default_contacts_by_company_params = {"count": 250} # NB> to do: support stream aliasing and field selection def _sync_contacts_by_company(STATE, ctx, company_id): schema = load_schema(CONTACTS_BY_COMPANY) catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE)) - mdata = metadata.to_map(catalog.get('metadata')) + mdata = metadata.to_map(catalog.get("metadata")) url = get_url("contacts_by_company", company_id=company_id) - path = 'vids' + path = "vids" with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee: with metrics.record_counter(CONTACTS_BY_COMPANY) as counter: data = request(url, default_contacts_by_company_params).json() for row in data[path]: counter.increment() - record = {'company_id' : company_id, - 'contact_id' : row} + record = {"company_id": company_id, "contact_id": row} record = bumble_bee.transform(record, schema, mdata) - singer.write_record("contacts_by_company", record, time_extracted=utils.now()) + singer.write_record( + "contacts_by_company", record, time_extracted=utils.now() + ) return STATE + default_company_params = { - 'limit': 250, 'properties': ["website", "name","country", "domain", "createdate", "hs_lastmodifieddate"] + "limit": 250, + "properties": [ + "website", + "name", + "country", + "domain", + "createdate", + "hs_lastmodifieddate", + ], } + def sync_companies(STATE, ctx): catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE)) - mdata = metadata.to_map(catalog.get('metadata')) + mdata = metadata.to_map(catalog.get("metadata")) bumble_bee = Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) - bookmark_key = 'hs_lastmodifieddate' + bookmark_key = "hs_lastmodifieddate" start = utils.strptime_to_utc(get_start(STATE, "companies", bookmark_key)) LOGGER.info("sync_companies from %s", start) - schema = load_schema('companies') - singer.write_schema("companies", schema, ["companyId"], [bookmark_key], catalog.get('stream_alias')) + schema = load_schema("companies") + singer.write_schema( + "companies", schema, ["companyId"], [bookmark_key], catalog.get("stream_alias") + ) # Because this stream doesn't query by `lastUpdated`, it cycles # through the data set every time. 
The issue with this is that there @@ -431,112 +502,169 @@ def sync_companies(STATE, ctx): max_bk_value = start if CONTACTS_BY_COMPANY in ctx.selected_stream_ids: contacts_by_company_schema = load_schema(CONTACTS_BY_COMPANY) - singer.write_schema("contacts_by_company", contacts_by_company_schema, ["company_id", "contact_id"]) + singer.write_schema( + "contacts_by_company", + contacts_by_company_schema, + ["company_id", "contact_id"], + ) with bumble_bee: - for row in gen_request(STATE, 'companies', url, default_company_params, 'companies', 'has-more', ['offset'], ['offset']): - row_properties = row['properties'] + for row in gen_request( + STATE, + "companies", + url, + default_company_params, + "companies", + "has-more", + ["offset"], + ["offset"], + ): + row_properties = row["properties"] modified_time = None if bookmark_key in row_properties: # Hubspot returns timestamps in millis - timestamp_millis = row_properties[bookmark_key]['timestamp'] / 1000.0 - modified_time = datetime.datetime.fromtimestamp(timestamp_millis, datetime.timezone.utc) - elif 'createdate' in row_properties: + timestamp_millis = row_properties[bookmark_key]["timestamp"] / 1000.0 + modified_time = datetime.datetime.fromtimestamp( + timestamp_millis, datetime.timezone.utc + ) + elif "createdate" in row_properties: # Hubspot returns timestamps in millis - timestamp_millis = row_properties['createdate']['timestamp'] / 1000.0 - modified_time = datetime.datetime.fromtimestamp(timestamp_millis, datetime.timezone.utc) + timestamp_millis = row_properties["createdate"]["timestamp"] / 1000.0 + modified_time = datetime.datetime.fromtimestamp( + timestamp_millis, datetime.timezone.utc + ) if modified_time and modified_time >= max_bk_value: max_bk_value = modified_time - record = bumble_bee.transform(row, schema, mdata) - singer.write_record("companies", record, catalog.get('stream_alias'), time_extracted=utils.now()) + record = bumble_bee.transform(row, schema, mdata) + singer.write_record( + "companies", + record, + catalog.get("stream_alias"), + time_extracted=utils.now(), + ) if CONTACTS_BY_COMPANY in ctx.selected_stream_ids: - STATE = _sync_contacts_by_company(STATE, ctx, record['companyId']) + STATE = _sync_contacts_by_company(STATE, ctx, record["companyId"]) # Don't bookmark past the start of this sync to account for updated records during the sync. 
new_bookmark = min(max_bk_value, current_sync_start) - STATE = singer.write_bookmark(STATE, 'companies', bookmark_key, utils.strftime(new_bookmark)) - STATE = write_current_sync_start(STATE, 'companies', None) + STATE = singer.write_bookmark( + STATE, "companies", bookmark_key, utils.strftime(new_bookmark) + ) + STATE = write_current_sync_start(STATE, "companies", None) singer.write_state(STATE) return STATE + def sync_deals(STATE, ctx): catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE)) - mdata = metadata.to_map(catalog.get('metadata')) - bookmark_key = 'hs_lastmodifieddate' + mdata = metadata.to_map(catalog.get("metadata")) + bookmark_key = "hs_lastmodifieddate" start = utils.strptime_with_tz(get_start(STATE, "deals", bookmark_key)) max_bk_value = start LOGGER.info("sync_deals from %s", start) most_recent_modified_time = start - params = {'count': 250, - 'includeAssociations': False, - 'properties' : []} + params = {"count": 250, "includeAssociations": False, "properties": []} schema = load_schema("deals") - singer.write_schema("deals", schema, ["dealId"], [bookmark_key], catalog.get('stream_alias')) + singer.write_schema( + "deals", schema, ["dealId"], [bookmark_key], catalog.get("stream_alias") + ) # Check if we should include associations for key in mdata.keys(): - if 'associations' in key: + if "associations" in key: assoc_mdata = mdata.get(key) - if (assoc_mdata.get('selected') and assoc_mdata.get('selected') == True): - params['includeAssociations'] = True + if assoc_mdata.get("selected") and assoc_mdata.get("selected") == True: + params["includeAssociations"] = True # Append all the properties fields for deals to the request if # properties is selectedOB - if mdata.get(('properties', 'properties'), {}).get('selected'): - additional_properties = schema.get("properties").get("properties").get("properties") + if mdata.get(("properties", "properties"), {}).get("selected"): + additional_properties = ( + schema.get("properties").get("properties").get("properties") + ) for key in additional_properties.keys(): - params['properties'].append(key) + params["properties"].append(key) - url = get_url('deals_all') + url = get_url("deals_all") with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee: - for row in gen_request(STATE, 'deals', url, params, 'deals', "hasMore", ["offset"], ["offset"]): - row_properties = row['properties'] + for row in gen_request( + STATE, "deals", url, params, "deals", "hasMore", ["offset"], ["offset"] + ): + row_properties = row["properties"] modified_time = None if bookmark_key in row_properties: # Hubspot returns timestamps in millis - timestamp_millis = row_properties[bookmark_key]['timestamp'] / 1000.0 - modified_time = datetime.datetime.fromtimestamp(timestamp_millis, datetime.timezone.utc) - elif 'createdate' in row_properties: + timestamp_millis = row_properties[bookmark_key]["timestamp"] / 1000.0 + modified_time = datetime.datetime.fromtimestamp( + timestamp_millis, datetime.timezone.utc + ) + elif "createdate" in row_properties: # Hubspot returns timestamps in millis - timestamp_millis = row_properties['createdate']['timestamp'] / 1000.0 - modified_time = datetime.datetime.fromtimestamp(timestamp_millis, datetime.timezone.utc) + timestamp_millis = row_properties["createdate"]["timestamp"] / 1000.0 + modified_time = datetime.datetime.fromtimestamp( + timestamp_millis, datetime.timezone.utc + ) if modified_time and modified_time >= max_bk_value: max_bk_value = modified_time if not modified_time or modified_time >= start: 
record = bumble_bee.transform(row, schema, mdata) - singer.write_record("deals", record, catalog.get('stream_alias'), time_extracted=utils.now()) - - STATE = singer.write_bookmark(STATE, 'deals', bookmark_key, utils.strftime(max_bk_value)) + singer.write_record( + "deals", + record, + catalog.get("stream_alias"), + time_extracted=utils.now(), + ) + + STATE = singer.write_bookmark( + STATE, "deals", bookmark_key, utils.strftime(max_bk_value) + ) singer.write_state(STATE) return STATE -#NB> no suitable bookmark is available: https://developers.hubspot.com/docs/methods/email/get_campaigns_by_id + +# NB> no suitable bookmark is available: https://developers.hubspot.com/docs/methods/email/get_campaigns_by_id def sync_campaigns(STATE, ctx): catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE)) - mdata = metadata.to_map(catalog.get('metadata')) + mdata = metadata.to_map(catalog.get("metadata")) schema = load_schema("campaigns") - singer.write_schema("campaigns", schema, ["id"], catalog.get('stream_alias')) + singer.write_schema("campaigns", schema, ["id"], catalog.get("stream_alias")) LOGGER.info("sync_campaigns(NO bookmarks)") url = get_url("campaigns_all") - params = {'limit': 500} + params = {"limit": 500} with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee: - for row in gen_request(STATE, 'campaigns', url, params, "campaigns", "hasMore", ["offset"], ["offset"]): - record = request(get_url("campaigns_detail", campaign_id=row['id'])).json() + for row in gen_request( + STATE, + "campaigns", + url, + params, + "campaigns", + "hasMore", + ["offset"], + ["offset"], + ): + record = request(get_url("campaigns_detail", campaign_id=row["id"])).json() record = bumble_bee.transform(record, schema, mdata) - singer.write_record("campaigns", record, catalog.get('stream_alias'), time_extracted=utils.now()) + singer.write_record( + "campaigns", + record, + catalog.get("stream_alias"), + time_extracted=utils.now(), + ) return STATE def sync_entity_chunked(STATE, catalog, entity_name, key_properties, path): schema = load_schema(entity_name) - bookmark_key = 'startTimestamp' + bookmark_key = "startTimestamp" - singer.write_schema(entity_name, schema, key_properties, [bookmark_key], catalog.get('stream_alias')) + singer.write_schema( + entity_name, schema, key_properties, [bookmark_key], catalog.get("stream_alias") + ) start = get_start(STATE, entity_name, bookmark_key) LOGGER.info("sync_%s from %s", entity_name, start) @@ -547,26 +675,22 @@ def sync_entity_chunked(STATE, catalog, entity_name, key_properties, path): start_ts = int(utils.strptime_with_tz(start).timestamp() * 1000) url = get_url(entity_name) - mdata = metadata.to_map(catalog.get('metadata')) + mdata = metadata.to_map(catalog.get("metadata")) - if entity_name == 'email_events': - window_size = int(CONFIG['email_chunk_size']) - elif entity_name == 'subscription_changes': - window_size = int(CONFIG['subscription_chunk_size']) + if entity_name == "email_events": + window_size = int(CONFIG["email_chunk_size"]) + elif entity_name == "subscription_changes": + window_size = int(CONFIG["subscription_chunk_size"]) with metrics.record_counter(entity_name) as counter: while start_ts < now_ts: end_ts = start_ts + window_size - params = { - 'startTimestamp': start_ts, - 'endTimestamp': end_ts, - 'limit': 1000, - } + params = {"startTimestamp": start_ts, "endTimestamp": end_ts, "limit": 1000} with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee: while True: our_offset = singer.get_offset(STATE, 
entity_name) - if bool(our_offset) and our_offset.get('offset') != None: - params[StateFields.offset] = our_offset.get('offset') + if bool(our_offset) and our_offset.get("offset") != None: + params[StateFields.offset] = our_offset.get("offset") data = request(url, params).json() time_extracted = utils.now() @@ -574,18 +698,31 @@ def sync_entity_chunked(STATE, catalog, entity_name, key_properties, path): for row in data[path]: counter.increment() record = bumble_bee.transform(row, schema, mdata) - singer.write_record(entity_name, - record, - catalog.get('stream_alias'), - time_extracted=time_extracted) - if data.get('hasMore'): - STATE = singer.set_offset(STATE, entity_name, 'offset', data['offset']) + singer.write_record( + entity_name, + record, + catalog.get("stream_alias"), + time_extracted=time_extracted, + ) + if data.get("hasMore"): + STATE = singer.set_offset( + STATE, entity_name, "offset", data["offset"] + ) singer.write_state(STATE) else: STATE = singer.clear_offset(STATE, entity_name) singer.write_state(STATE) break - STATE = singer.write_bookmark(STATE, entity_name, 'startTimestamp', utils.strftime(datetime.datetime.fromtimestamp((start_ts / 1000), datetime.timezone.utc ))) # pylint: disable=line-too-long + STATE = singer.write_bookmark( + STATE, + entity_name, + "startTimestamp", + utils.strftime( + datetime.datetime.fromtimestamp( + (start_ts / 1000), datetime.timezone.utc + ) + ), + ) # pylint: disable=line-too-long singer.write_state(STATE) start_ts = end_ts @@ -593,23 +730,33 @@ def sync_entity_chunked(STATE, catalog, entity_name, key_properties, path): singer.write_state(STATE) return STATE + def sync_subscription_changes(STATE, ctx): catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE)) - STATE = sync_entity_chunked(STATE, catalog, "subscription_changes", ["timestamp", "portalId", "recipient"], - "timeline") + STATE = sync_entity_chunked( + STATE, + catalog, + "subscription_changes", + ["timestamp", "portalId", "recipient"], + "timeline", + ) return STATE + def sync_email_events(STATE, ctx): catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE)) STATE = sync_entity_chunked(STATE, catalog, "email_events", ["id"], "events") return STATE + def sync_contact_lists(STATE, ctx): catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE)) - mdata = metadata.to_map(catalog.get('metadata')) + mdata = metadata.to_map(catalog.get("metadata")) schema = load_schema("contact_lists") - bookmark_key = 'updatedAt' - singer.write_schema("contact_lists", schema, ["listId"], [bookmark_key], catalog.get('stream_alias')) + bookmark_key = "updatedAt" + singer.write_schema( + "contact_lists", schema, ["listId"], [bookmark_key], catalog.get("stream_alias") + ) start = get_start(STATE, "contact_lists", bookmark_key) max_bk_value = start @@ -617,28 +764,45 @@ def sync_contact_lists(STATE, ctx): LOGGER.info("sync_contact_lists from %s", start) url = get_url("contact_lists") - params = {'count': 250} + params = {"count": 250} with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee: - for row in gen_request(STATE, 'contact_lists', url, params, "lists", "has-more", ["offset"], ["offset"]): + for row in gen_request( + STATE, + "contact_lists", + url, + params, + "lists", + "has-more", + ["offset"], + ["offset"], + ): record = bumble_bee.transform(row, schema, mdata) if record[bookmark_key] >= start: - singer.write_record("contact_lists", record, catalog.get('stream_alias'), time_extracted=utils.now()) + singer.write_record( + 
"contact_lists", + record, + catalog.get("stream_alias"), + time_extracted=utils.now(), + ) if record[bookmark_key] >= max_bk_value: max_bk_value = record[bookmark_key] - STATE = singer.write_bookmark(STATE, 'contact_lists', bookmark_key, max_bk_value) + STATE = singer.write_bookmark(STATE, "contact_lists", bookmark_key, max_bk_value) singer.write_state(STATE) return STATE + def sync_forms(STATE, ctx): catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE)) - mdata = metadata.to_map(catalog.get('metadata')) + mdata = metadata.to_map(catalog.get("metadata")) schema = load_schema("forms") - bookmark_key = 'updatedAt' + bookmark_key = "updatedAt" - singer.write_schema("forms", schema, ["guid"], [bookmark_key], catalog.get('stream_alias')) + singer.write_schema( + "forms", schema, ["guid"], [bookmark_key], catalog.get("stream_alias") + ) start = get_start(STATE, "forms", bookmark_key) max_bk_value = start @@ -652,25 +816,33 @@ def sync_forms(STATE, ctx): record = bumble_bee.transform(row, schema, mdata) if record[bookmark_key] >= start: - singer.write_record("forms", record, catalog.get('stream_alias'), time_extracted=time_extracted) + singer.write_record( + "forms", + record, + catalog.get("stream_alias"), + time_extracted=time_extracted, + ) if record[bookmark_key] >= max_bk_value: max_bk_value = record[bookmark_key] - STATE = singer.write_bookmark(STATE, 'forms', bookmark_key, max_bk_value) + STATE = singer.write_bookmark(STATE, "forms", bookmark_key, max_bk_value) singer.write_state(STATE) return STATE + def sync_workflows(STATE, ctx): catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE)) - mdata = metadata.to_map(catalog.get('metadata')) + mdata = metadata.to_map(catalog.get("metadata")) schema = load_schema("workflows") - bookmark_key = 'updatedAt' - singer.write_schema("workflows", schema, ["id"], [bookmark_key], catalog.get('stream_alias')) + bookmark_key = "updatedAt" + singer.write_schema( + "workflows", schema, ["id"], [bookmark_key], catalog.get("stream_alias") + ) start = get_start(STATE, "workflows", bookmark_key) max_bk_value = start - STATE = singer.write_bookmark(STATE, 'workflows', bookmark_key, max_bk_value) + STATE = singer.write_bookmark(STATE, "workflows", bookmark_key, max_bk_value) singer.write_state(STATE) LOGGER.info("sync_workflows from %s", start) @@ -679,32 +851,40 @@ def sync_workflows(STATE, ctx): time_extracted = utils.now() with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee: - for row in data['workflows']: + for row in data["workflows"]: record = bumble_bee.transform(row, schema, mdata) if record[bookmark_key] >= start: - singer.write_record("workflows", record, catalog.get('stream_alias'), time_extracted=time_extracted) + singer.write_record( + "workflows", + record, + catalog.get("stream_alias"), + time_extracted=time_extracted, + ) if record[bookmark_key] >= max_bk_value: max_bk_value = record[bookmark_key] - STATE = singer.write_bookmark(STATE, 'workflows', bookmark_key, max_bk_value) + STATE = singer.write_bookmark(STATE, "workflows", bookmark_key, max_bk_value) singer.write_state(STATE) return STATE + def sync_owners(STATE, ctx): catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE)) - mdata = metadata.to_map(catalog.get('metadata')) + mdata = metadata.to_map(catalog.get("metadata")) schema = load_schema("owners") - bookmark_key = 'updatedAt' + bookmark_key = "updatedAt" - singer.write_schema("owners", schema, ["ownerId"], [bookmark_key], catalog.get('stream_alias')) + 
singer.write_schema( + "owners", schema, ["ownerId"], [bookmark_key], catalog.get("stream_alias") + ) start = get_start(STATE, "owners", bookmark_key) max_bk_value = start LOGGER.info("sync_owners from %s", start) params = {} - if CONFIG.get('include_inactives'): - params['includeInactives'] = "true" + if CONFIG.get("include_inactives"): + params["includeInactives"] = "true" data = request(get_url("owners"), params).json() time_extracted = utils.now() @@ -716,18 +896,30 @@ def sync_owners(STATE, ctx): max_bk_value = record[bookmark_key] if record[bookmark_key] >= start: - singer.write_record("owners", record, catalog.get('stream_alias'), time_extracted=time_extracted) - - STATE = singer.write_bookmark(STATE, 'owners', bookmark_key, max_bk_value) + singer.write_record( + "owners", + record, + catalog.get("stream_alias"), + time_extracted=time_extracted, + ) + + STATE = singer.write_bookmark(STATE, "owners", bookmark_key, max_bk_value) singer.write_state(STATE) return STATE + def sync_engagements(STATE, ctx): catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE)) - mdata = metadata.to_map(catalog.get('metadata')) + mdata = metadata.to_map(catalog.get("metadata")) schema = load_schema("engagements") - bookmark_key = 'lastUpdated' - singer.write_schema("engagements", schema, ["engagement_id"], [bookmark_key], catalog.get('stream_alias')) + bookmark_key = "lastUpdated" + singer.write_schema( + "engagements", + schema, + ["engagement_id"], + [bookmark_key], + catalog.get("stream_alias"), + ) start = get_start(STATE, "engagements", bookmark_key) # Because this stream doesn't query by `lastUpdated`, it cycles @@ -743,48 +935,73 @@ def sync_engagements(STATE, ctx): max_bk_value = start LOGGER.info("sync_engagements from %s", start) - STATE = singer.write_bookmark(STATE, 'engagements', bookmark_key, start) + STATE = singer.write_bookmark(STATE, "engagements", bookmark_key, start) singer.write_state(STATE) url = get_url("engagements_all") - params = {'limit': 250} + params = {"limit": 250} top_level_key = "results" - engagements = gen_request(STATE, 'engagements', url, params, top_level_key, "hasMore", ["offset"], ["offset"]) + engagements = gen_request( + STATE, + "engagements", + url, + params, + top_level_key, + "hasMore", + ["offset"], + ["offset"], + ) time_extracted = utils.now() with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee: for engagement in engagements: record = bumble_bee.transform(engagement, schema, mdata) - if record['engagement'][bookmark_key] >= start: + if record["engagement"][bookmark_key] >= start: # hoist PK and bookmark field to top-level record - record['engagement_id'] = record['engagement']['id'] - record[bookmark_key] = record['engagement'][bookmark_key] - singer.write_record("engagements", record, catalog.get('stream_alias'), time_extracted=time_extracted) - if record['engagement'][bookmark_key] >= max_bk_value: - max_bk_value = record['engagement'][bookmark_key] + record["engagement_id"] = record["engagement"]["id"] + record[bookmark_key] = record["engagement"][bookmark_key] + singer.write_record( + "engagements", + record, + catalog.get("stream_alias"), + time_extracted=time_extracted, + ) + if record["engagement"][bookmark_key] >= max_bk_value: + max_bk_value = record["engagement"][bookmark_key] # Don't bookmark past the start of this sync to account for updated records during the sync. 
new_bookmark = min(utils.strptime_to_utc(max_bk_value), current_sync_start) - STATE = singer.write_bookmark(STATE, 'engagements', bookmark_key, utils.strftime(new_bookmark)) - STATE = write_current_sync_start(STATE, 'engagements', None) + STATE = singer.write_bookmark( + STATE, "engagements", bookmark_key, utils.strftime(new_bookmark) + ) + STATE = write_current_sync_start(STATE, "engagements", None) singer.write_state(STATE) return STATE + def sync_deal_pipelines(STATE, ctx): catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE)) - mdata = metadata.to_map(catalog.get('metadata')) - schema = load_schema('deal_pipelines') - singer.write_schema('deal_pipelines', schema, ['pipelineId'], catalog.get('stream_alias')) - LOGGER.info('sync_deal_pipelines') - data = request(get_url('deal_pipelines')).json() + mdata = metadata.to_map(catalog.get("metadata")) + schema = load_schema("deal_pipelines") + singer.write_schema( + "deal_pipelines", schema, ["pipelineId"], catalog.get("stream_alias") + ) + LOGGER.info("sync_deal_pipelines") + data = request(get_url("deal_pipelines")).json() with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee: for row in data: record = bumble_bee.transform(row, schema, mdata) - singer.write_record("deal_pipelines", record, catalog.get('stream_alias'), time_extracted=utils.now()) + singer.write_record( + "deal_pipelines", + record, + catalog.get("stream_alias"), + time_extracted=utils.now(), + ) singer.write_state(STATE) return STATE + @attr.s class Stream(object): tap_stream_id = attr.ib() @@ -793,37 +1010,51 @@ class Stream(object): replication_key = attr.ib() replication_method = attr.ib() + STREAMS = [ # Do these first as they are incremental - Stream('subscription_changes', sync_subscription_changes, ['timestamp', 'portalId', 'recipient'], 'startTimestamp', 'INCREMENTAL'), - Stream('email_events', sync_email_events, ['id'], 'startTimestamp', 'INCREMENTAL'), - + Stream( + "subscription_changes", + sync_subscription_changes, + ["timestamp", "portalId", "recipient"], + "startTimestamp", + "INCREMENTAL", + ), + Stream("email_events", sync_email_events, ["id"], "startTimestamp", "INCREMENTAL"), # Do these last as they are full table - Stream('forms', sync_forms, ['guid'], 'updatedAt', 'FULL_TABLE'), - Stream('workflows', sync_workflows, ['id'], 'updatedAt', 'FULL_TABLE'), - Stream('owners', sync_owners, ["ownerId"], 'updatedAt', 'FULL_TABLE'), - Stream('campaigns', sync_campaigns, ["id"], None, 'FULL_TABLE'), - Stream('contact_lists', sync_contact_lists, ["listId"], 'updatedAt', 'FULL_TABLE'), - Stream('contacts', sync_contacts, ["vid"], 'versionTimestamp', 'FULL_TABLE'), - Stream('companies', sync_companies, ["companyId"], 'hs_lastmodifieddate', 'FULL_TABLE'), - Stream('deals', sync_deals, ["dealId"], 'hs_lastmodifieddate', 'FULL_TABLE'), - Stream('deal_pipelines', sync_deal_pipelines, ['pipelineId'], None, 'FULL_TABLE'), - Stream('engagements', sync_engagements, ["engagement_id"], 'lastUpdated', 'FULL_TABLE') + Stream("forms", sync_forms, ["guid"], "updatedAt", "FULL_TABLE"), + Stream("workflows", sync_workflows, ["id"], "updatedAt", "FULL_TABLE"), + Stream("owners", sync_owners, ["ownerId"], "updatedAt", "FULL_TABLE"), + Stream("campaigns", sync_campaigns, ["id"], None, "FULL_TABLE"), + Stream("contact_lists", sync_contact_lists, ["listId"], "updatedAt", "FULL_TABLE"), + Stream("contacts", sync_contacts, ["vid"], "versionTimestamp", "FULL_TABLE"), + Stream( + "companies", sync_companies, ["companyId"], "hs_lastmodifieddate", 
"FULL_TABLE" + ), + Stream("deals", sync_deals, ["dealId"], "hs_lastmodifieddate", "FULL_TABLE"), + Stream("deal_pipelines", sync_deal_pipelines, ["pipelineId"], None, "FULL_TABLE"), + Stream( + "engagements", sync_engagements, ["engagement_id"], "lastUpdated", "FULL_TABLE" + ), ] + def get_streams_to_sync(streams, state): target_stream = singer.get_currently_syncing(state) result = streams if target_stream: - skipped = list(itertools.takewhile( - lambda x: x.tap_stream_id != target_stream, streams)) - rest = list(itertools.dropwhile( - lambda x: x.tap_stream_id != target_stream, streams)) - result = rest + skipped # Move skipped streams to end + skipped = list( + itertools.takewhile(lambda x: x.tap_stream_id != target_stream, streams) + ) + rest = list( + itertools.dropwhile(lambda x: x.tap_stream_id != target_stream, streams) + ) + result = rest + skipped # Move skipped streams to end if not result: - raise Exception('Unknown stream {} in state'.format(target_stream)) + raise Exception("Unknown stream {} in state".format(target_stream)) return result + def get_selected_streams(remaining_streams, ctx): selected_streams = [] for stream in remaining_streams: @@ -831,6 +1062,7 @@ def get_selected_streams(remaining_streams, ctx): selected_streams.append(stream) return selected_streams + def do_sync(STATE, catalog): # Clear out keys that are no longer used clean_state(STATE) @@ -840,17 +1072,19 @@ def do_sync(STATE, catalog): remaining_streams = get_streams_to_sync(STREAMS, STATE) selected_streams = get_selected_streams(remaining_streams, ctx) - LOGGER.info('Starting sync. Will sync these streams: %s', - [stream.tap_stream_id for stream in selected_streams]) + LOGGER.info( + "Starting sync. Will sync these streams: %s", + [stream.tap_stream_id for stream in selected_streams], + ) for stream in selected_streams: - LOGGER.info('Syncing %s', stream.tap_stream_id) + LOGGER.info("Syncing %s", stream.tap_stream_id) STATE = singer.set_currently_syncing(STATE, stream.tap_stream_id) singer.write_state(STATE) try: - STATE = stream.sync(STATE, ctx) # pylint: disable=not-callable + STATE = stream.sync(STATE, ctx) # pylint: disable=not-callable except SourceUnavailableException as ex: - error_message = str(ex).replace(CONFIG['access_token'], 10 * '*') + error_message = str(ex).replace(CONFIG["access_token"], 10 * "*") LOGGER.error(error_message) pass @@ -858,91 +1092,120 @@ def do_sync(STATE, catalog): singer.write_state(STATE) LOGGER.info("Sync completed") + class Context(object): def __init__(self, catalog): self.selected_stream_ids = set() - for stream in catalog.get('streams'): - mdata = metadata.to_map(stream['metadata']) - if metadata.get(mdata, (), 'selected'): - self.selected_stream_ids.add(stream['tap_stream_id']) + for stream in catalog.get("streams"): + mdata = metadata.to_map(stream["metadata"]) + if metadata.get(mdata, (), "selected"): + self.selected_stream_ids.add(stream["tap_stream_id"]) self.catalog = catalog - def get_catalog_from_id(self,tap_stream_id): - return [c for c in self.catalog.get('streams') - if c.get('stream') == tap_stream_id][0] + def get_catalog_from_id(self, tap_stream_id): + return [ + c for c in self.catalog.get("streams") if c.get("stream") == tap_stream_id + ][0] + # stream a is dependent on stream STREAM_DEPENDENCIES[a] -STREAM_DEPENDENCIES = { - CONTACTS_BY_COMPANY: 'companies' -} +STREAM_DEPENDENCIES = {CONTACTS_BY_COMPANY: "companies"} + def validate_dependencies(ctx): errs = [] - msg_tmpl = ("Unable to extract {0} data. 
" - "To receive {0} data, you also need to select {1}.") + msg_tmpl = ( + "Unable to extract {0} data. " + "To receive {0} data, you also need to select {1}." + ) - for k,v in STREAM_DEPENDENCIES.items(): + for k, v in STREAM_DEPENDENCIES.items(): if k in ctx.selected_stream_ids and v not in ctx.selected_stream_ids: errs.append(msg_tmpl.format(k, v)) if errs: raise DependencyException(" ".join(errs)) + def load_discovered_schema(stream): schema = load_schema(stream.tap_stream_id) mdata = metadata.new() - mdata = metadata.write(mdata, (), 'table-key-properties', stream.key_properties) - mdata = metadata.write(mdata, (), 'forced-replication-method', stream.replication_method) + mdata = metadata.write(mdata, (), "table-key-properties", stream.key_properties) + mdata = metadata.write( + mdata, (), "forced-replication-method", stream.replication_method + ) if stream.replication_key: - mdata = metadata.write(mdata, (), 'valid-replication-keys', [stream.replication_key]) + mdata = metadata.write( + mdata, (), "valid-replication-keys", [stream.replication_key] + ) - for field_name, props in schema['properties'].items(): + for field_name, props in schema["properties"].items(): if field_name in stream.key_properties or field_name == stream.replication_key: - mdata = metadata.write(mdata, ('properties', field_name), 'inclusion', 'automatic') + mdata = metadata.write( + mdata, ("properties", field_name), "inclusion", "automatic" + ) else: - mdata = metadata.write(mdata, ('properties', field_name), 'inclusion', 'available') + mdata = metadata.write( + mdata, ("properties", field_name), "inclusion", "available" + ) # The engagements stream has nested data that we synthesize; The engagement field needs to be automatic if stream.tap_stream_id == "engagements": - mdata = metadata.write(mdata, ('properties', 'engagement'), 'inclusion', 'automatic') + mdata = metadata.write( + mdata, ("properties", "engagement"), "inclusion", "automatic" + ) return schema, metadata.to_list(mdata) + def discover_schemas(): - result = {'streams': []} + result = {"streams": []} for stream in STREAMS: - LOGGER.info('Loading schema for %s', stream.tap_stream_id) + LOGGER.info("Loading schema for %s", stream.tap_stream_id) schema, mdata = load_discovered_schema(stream) - result['streams'].append({'stream': stream.tap_stream_id, - 'tap_stream_id': stream.tap_stream_id, - 'schema': schema, - 'metadata': mdata}) + result["streams"].append( + { + "stream": stream.tap_stream_id, + "tap_stream_id": stream.tap_stream_id, + "schema": schema, + "metadata": mdata, + } + ) # Load the contacts_by_company schema - LOGGER.info('Loading schema for contacts_by_company') - contacts_by_company = Stream('contacts_by_company', _sync_contacts_by_company, ['company-id', 'contact-id'], None, 'FULL_TABLE') + LOGGER.info("Loading schema for contacts_by_company") + contacts_by_company = Stream( + "contacts_by_company", + _sync_contacts_by_company, + ["company-id", "contact-id"], + None, + "FULL_TABLE", + ) schema, mdata = load_discovered_schema(contacts_by_company) - result['streams'].append({'stream': CONTACTS_BY_COMPANY, - 'tap_stream_id': CONTACTS_BY_COMPANY, - 'schema': schema, - 'metadata': mdata}) + result["streams"].append( + { + "stream": CONTACTS_BY_COMPANY, + "tap_stream_id": CONTACTS_BY_COMPANY, + "schema": schema, + "metadata": mdata, + } + ) return result + def do_discover(): - LOGGER.info('Loading schemas') + LOGGER.info("Loading schemas") json.dump(discover_schemas(), sys.stdout, indent=4) + def main_impl(): args = utils.parse_args( - 
["redirect_uri", - "client_id", - "client_secret", - "refresh_token", - "start_date"]) + ["redirect_uri", "client_id", "client_secret", "refresh_token", "start_date"] + ) CONFIG.update(args.config) STATE = {} @@ -957,6 +1220,7 @@ def main_impl(): else: LOGGER.info("No properties were selected") + def main(): try: main_impl() @@ -964,5 +1228,6 @@ def main(): LOGGER.critical(exc) raise exc -if __name__ == '__main__': + +if __name__ == "__main__": main() From 6e83a498710cfb645c1dee696259293ce10dfca8 Mon Sep 17 00:00:00 2001 From: "Patrick-Ranjit D. Madsen" Date: Mon, 6 Jan 2020 14:45:35 +0100 Subject: [PATCH 13/78] remove string as an option for numbers --- tap_hubspot/__init__.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tap_hubspot/__init__.py b/tap_hubspot/__init__.py index 09f8bb8e..a0fb96f7 100644 --- a/tap_hubspot/__init__.py +++ b/tap_hubspot/__init__.py @@ -141,9 +141,7 @@ def get_field_type_schema(field_type): return {"type": ["null", "string"], "format": "date-time"} elif field_type == "number": - # A value like 'N/A' can be returned for this type, - # so we have to let this be a string sometimes - return {"type": ["null", "number", "string"]} + return {"type": ["null", "number"]} else: return {"type": ["null", "string"]} From 929f4b1e4f94ffaaac151f5a84bb4472add083d1 Mon Sep 17 00:00:00 2001 From: "Patrick-Ranjit D. Madsen" Date: Mon, 6 Jan 2020 14:56:51 +0100 Subject: [PATCH 14/78] add function to remove all N/A from an object --- tap_hubspot/__init__.py | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/tap_hubspot/__init__.py b/tap_hubspot/__init__.py index a0fb96f7..f636e8cd 100644 --- a/tap_hubspot/__init__.py +++ b/tap_hubspot/__init__.py @@ -132,6 +132,42 @@ def get_url(endpoint, **kwargs): return BASE_URL + ENDPOINTS[endpoint].format(**kwargs) +def replace_na_with_none(obj): + '''Given a certain object, the function will replace any 'N/A' values with None. + E.g: object = { + "key1" : [{"subkey1": "value1"}, {"subkey2": "N/A"}], + "key2" : "n/a", + "key3" : { + "subkey3" : "n/a", + "subkey4" : "value2" + } + } + self.replace_na_with_none(object) will return: + { + "key1" : [{"subkey1": "value1"}, {"subkey2": None}], + "key2" : None, + "key3" : { + "subkey3" : None, + "subkey4" : "value2" + } + } + ''' + if isinstance(obj, dict): + new_dict = {} + for key, value in obj.items(): + new_dict[key] = replace_na_with_none(value) + return new_dict + + if isinstance(obj, list): + new_list = [] + for value in obj: + new_list.append(replace_na_with_none(value)) + return new_list + + if isinstance(obj, str): + if obj.lower() == 'n/a': + obj = None + return obj def get_field_type_schema(field_type): if field_type == "bool": From 63087ec382302405bc86e7fbed41309bbd9660e3 Mon Sep 17 00:00:00 2001 From: "Patrick-Ranjit D. 
Madsen" Date: Mon, 6 Jan 2020 14:57:15 +0100 Subject: [PATCH 15/78] remove N/A from all records/rows when sync'ing --- tap_hubspot/__init__.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tap_hubspot/__init__.py b/tap_hubspot/__init__.py index f636e8cd..ac15b6a9 100644 --- a/tap_hubspot/__init__.py +++ b/tap_hubspot/__init__.py @@ -394,6 +394,7 @@ def _sync_contact_vids(catalog, vids, schema, bumble_bee): mdata = metadata.to_map(catalog.get("metadata")) for record in data.values(): + record = replace_na_with_none(record) record = bumble_bee.transform(record, schema, mdata) singer.write_record( "contacts", @@ -488,6 +489,7 @@ def _sync_contacts_by_company(STATE, ctx, company_id): data = request(url, default_contacts_by_company_params).json() for row in data[path]: counter.increment() + row = replace_na_with_none(row) record = {"company_id": company_id, "contact_id": row} record = bumble_bee.transform(record, schema, mdata) singer.write_record( @@ -681,6 +683,7 @@ def sync_campaigns(STATE, ctx): ["offset"], ): record = request(get_url("campaigns_detail", campaign_id=row["id"])).json() + record = replace_na_with_none(record) record = bumble_bee.transform(record, schema, mdata) singer.write_record( "campaigns", @@ -731,6 +734,7 @@ def sync_entity_chunked(STATE, catalog, entity_name, key_properties, path): for row in data[path]: counter.increment() + row = replace_na_with_none(row) record = bumble_bee.transform(row, schema, mdata) singer.write_record( entity_name, @@ -810,6 +814,7 @@ def sync_contact_lists(STATE, ctx): ["offset"], ["offset"], ): + row = replace_na_with_none(row) record = bumble_bee.transform(row, schema, mdata) if record[bookmark_key] >= start: @@ -847,6 +852,7 @@ def sync_forms(STATE, ctx): with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee: for row in data: + row = replace_na_with_none(row) record = bumble_bee.transform(row, schema, mdata) if record[bookmark_key] >= start: @@ -886,6 +892,7 @@ def sync_workflows(STATE, ctx): with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee: for row in data["workflows"]: + row = replace_na_with_none(row) record = bumble_bee.transform(row, schema, mdata) if record[bookmark_key] >= start: singer.write_record( @@ -925,6 +932,7 @@ def sync_owners(STATE, ctx): with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee: for row in data: + row = replace_na_with_none(row) record = bumble_bee.transform(row, schema, mdata) if record[bookmark_key] >= max_bk_value: max_bk_value = record[bookmark_key] @@ -990,6 +998,7 @@ def sync_engagements(STATE, ctx): with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee: for engagement in engagements: + engagement = replace_na_with_none(engagement) record = bumble_bee.transform(engagement, schema, mdata) if record["engagement"][bookmark_key] >= start: # hoist PK and bookmark field to top-level record @@ -1025,6 +1034,7 @@ def sync_deal_pipelines(STATE, ctx): data = request(get_url("deal_pipelines")).json() with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee: for row in data: + row = replace_na_with_none(row) record = bumble_bee.transform(row, schema, mdata) singer.write_record( "deal_pipelines", From 204de23f43a6c9e5a43d9f9ec5e2860c0f179ecd Mon Sep 17 00:00:00 2001 From: JingLinDaisy Date: Wed, 8 Jan 2020 10:12:46 +0100 Subject: [PATCH 16/78] black commit --- tap_hubspot/__init__.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tap_hubspot/__init__.py 
b/tap_hubspot/__init__.py index ac15b6a9..a9dc2289 100644 --- a/tap_hubspot/__init__.py +++ b/tap_hubspot/__init__.py @@ -132,8 +132,9 @@ def get_url(endpoint, **kwargs): return BASE_URL + ENDPOINTS[endpoint].format(**kwargs) + def replace_na_with_none(obj): - '''Given a certain object, the function will replace any 'N/A' values with None. + """Given a certain object, the function will replace any 'N/A' values with None. E.g: object = { "key1" : [{"subkey1": "value1"}, {"subkey2": "N/A"}], "key2" : "n/a", @@ -151,7 +152,7 @@ def replace_na_with_none(obj): "subkey4" : "value2" } } - ''' + """ if isinstance(obj, dict): new_dict = {} for key, value in obj.items(): @@ -165,10 +166,11 @@ def replace_na_with_none(obj): return new_list if isinstance(obj, str): - if obj.lower() == 'n/a': + if obj.lower() == "n/a": obj = None return obj + def get_field_type_schema(field_type): if field_type == "bool": return {"type": ["null", "boolean"]} From 6804118bcc219dda1d66b4c9ffb71dd42bd538a3 Mon Sep 17 00:00:00 2001 From: JingLinDaisy Date: Wed, 8 Jan 2020 10:20:59 +0100 Subject: [PATCH 17/78] change schema dash to underscore --- tap_hubspot/__init__.py | 45 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/tap_hubspot/__init__.py b/tap_hubspot/__init__.py index a9dc2289..73c2d914 100644 --- a/tap_hubspot/__init__.py +++ b/tap_hubspot/__init__.py @@ -244,7 +244,50 @@ def load_schema(entity_name): if entity_name == "contacts": schema["properties"]["associated-company"] = load_associated_company_schema() - return schema + return schema_nodash(schema) + + +def schema_nodash(obj): + type_field = obj.get("type") + type = get_type(type_field) + if not type: + return obj + if not type in ["array", "object"]: + return obj + if "object" == type: + props = obj.get("properties", {}) + new_props = replace_props(props) + obj["properties"] = new_props + if "array" == type: + items = obj.get("items", {}) + obj["items"] = schema_nodash(items) + return obj + + +def get_type(type_field): + if isinstance(type_field, str): + return type_field + if isinstance(type_field, list): + types = set(type_field) + if "null" in types: + types.remove("null") + return types.pop() + return None + + +def replace_props(props): + if not props: + return props + keys = list(props.keys()) + for k in keys: + if not "-" in k: + props[k] = schema_nodash(props[k]) + else: + v = props.pop(k) + new_key = k.replace("-", "_") + new_value = schema_nodash(v) + props[new_key] = new_value + return props # pylint: disable=invalid-name From 359eed1613e97bbf6e81ba90233e9fedbe82231f Mon Sep 17 00:00:00 2001 From: JingLinDaisy Date: Wed, 8 Jan 2020 10:26:12 +0100 Subject: [PATCH 18/78] change record dash to underscore --- tap_hubspot/__init__.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tap_hubspot/__init__.py b/tap_hubspot/__init__.py index 73c2d914..c42ffc68 100644 --- a/tap_hubspot/__init__.py +++ b/tap_hubspot/__init__.py @@ -441,6 +441,7 @@ def _sync_contact_vids(catalog, vids, schema, bumble_bee): for record in data.values(): record = replace_na_with_none(record) record = bumble_bee.transform(record, schema, mdata) + record = record_nodash(record) singer.write_record( "contacts", record, @@ -449,6 +450,23 @@ def _sync_contact_vids(catalog, vids, schema, bumble_bee): ) +def record_nodash(obj): + if not isinstance(obj, dict): # stopplesing criteria + return obj + + for k in obj.keys(): + value = record_nodash(obj[k]) + if not "-" in k: + key = k + else: + obj.pop(k) + key = 
k.replace("-", "_") + + obj[key] = value # recursion + + return obj + + default_contact_params = { "showListMemberships": True, "includeVersion": True, From 7e6d588f64765b040bf99e13ed6b81067615f5fb Mon Sep 17 00:00:00 2001 From: "Patrick-Ranjit D. Madsen" Date: Wed, 8 Jan 2020 14:47:37 +0100 Subject: [PATCH 19/78] black commit --- setup.py | 69 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 35 insertions(+), 34 deletions(-) diff --git a/setup.py b/setup.py index 60c48295..541abe12 100644 --- a/setup.py +++ b/setup.py @@ -2,40 +2,41 @@ from setuptools import setup -setup(name='tap-hubspot', - version='2.6.4', - description='Singer.io tap for extracting data from the HubSpot API', - author='Stitch', - url='http://singer.io', - classifiers=['Programming Language :: Python :: 3 :: Only'], - py_modules=['tap_hubspot'], - install_requires=[ - 'attrs==16.3.0', - 'singer-python==5.1.1', - 'requests==2.20.0', - 'backoff==1.3.2', - 'requests_mock==1.3.0', - 'nose' - ], - entry_points=''' +setup( + name="tap-hubspot", + version="2.6.4", + description="Singer.io tap for extracting data from the HubSpot API", + author="Stitch", + url="http://singer.io", + classifiers=["Programming Language :: Python :: 3 :: Only"], + py_modules=["tap_hubspot"], + install_requires=[ + "attrs==16.3.0", + "singer-python==5.1.1", + "requests==2.20.0", + "backoff==1.3.2", + "requests_mock==1.3.0", + "nose", + ], + entry_points=""" [console_scripts] tap-hubspot=tap_hubspot:main - ''', - packages=['tap_hubspot'], - package_data = { - 'tap_hubspot/schemas': [ - "campaigns.json", - "companies.json", - "contact_lists.json", - "contacts.json", - "deals.json", - "email_events.json", - "forms.json", - "keywords.json", - "owners.json", - "subscription_changes.json", - "workflows.json", - ], - }, - include_package_data=True, + """, + packages=["tap_hubspot"], + package_data={ + "tap_hubspot/schemas": [ + "campaigns.json", + "companies.json", + "contact_lists.json", + "contacts.json", + "deals.json", + "email_events.json", + "forms.json", + "keywords.json", + "owners.json", + "subscription_changes.json", + "workflows.json", + ], + }, + include_package_data=True, ) From ddf0bc32b583c7b3ca9b78865cee3f639657bbf3 Mon Sep 17 00:00:00 2001 From: "Patrick-Ranjit D. Madsen" Date: Wed, 8 Jan 2020 14:48:56 +0100 Subject: [PATCH 20/78] bump dependencies --- setup.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index 541abe12..851f6708 100644 --- a/setup.py +++ b/setup.py @@ -11,10 +11,10 @@ classifiers=["Programming Language :: Python :: 3 :: Only"], py_modules=["tap_hubspot"], install_requires=[ - "attrs==16.3.0", - "singer-python==5.1.1", - "requests==2.20.0", - "backoff==1.3.2", + "attrs>=16.3.0, <19", + "singer-python>=5.1.1, <5.9", + "requests==2.22.0", + "backoff>=1.3.2, <2", "requests_mock==1.3.0", "nose", ], From c70b97f6c12d8ed3b54519685e0386f68cae47eb Mon Sep 17 00:00:00 2001 From: "Patrick-Ranjit D. 
Madsen" Date: Thu, 16 Jan 2020 10:35:51 +0100 Subject: [PATCH 21/78] lift schema from catalog in contacts --- tap_hubspot/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tap_hubspot/__init__.py b/tap_hubspot/__init__.py index c42ffc68..a67febe2 100644 --- a/tap_hubspot/__init__.py +++ b/tap_hubspot/__init__.py @@ -481,7 +481,7 @@ def sync_contacts(STATE, ctx): LOGGER.info("sync_contacts from %s", start) max_bk_value = start - schema = load_schema("contacts") + schema = catalog["schema"] singer.write_schema( "contacts", schema, ["vid"], [bookmark_key], catalog.get("stream_alias") From 1ba0193671e9c817cd8fc2575054b2bb5b3e21b3 Mon Sep 17 00:00:00 2001 From: "Patrick-Ranjit D. Madsen" Date: Thu, 16 Jan 2020 10:37:22 +0100 Subject: [PATCH 22/78] use catalog schema in contacts_by_companies --- tap_hubspot/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tap_hubspot/__init__.py b/tap_hubspot/__init__.py index a67febe2..63e75bea 100644 --- a/tap_hubspot/__init__.py +++ b/tap_hubspot/__init__.py @@ -542,8 +542,8 @@ def use_recent_companies_endpoint(response): # NB> to do: support stream aliasing and field selection def _sync_contacts_by_company(STATE, ctx, company_id): - schema = load_schema(CONTACTS_BY_COMPANY) catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE)) + schema = catalog.get["schema"] mdata = metadata.to_map(catalog.get("metadata")) url = get_url("contacts_by_company", company_id=company_id) path = "vids" From 9881146f33dcfeae512e78713a35d71ec5451ec8 Mon Sep 17 00:00:00 2001 From: "Patrick-Ranjit D. Madsen" Date: Thu, 16 Jan 2020 10:38:00 +0100 Subject: [PATCH 23/78] catalog.. schemas.. you get the gist --- tap_hubspot/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tap_hubspot/__init__.py b/tap_hubspot/__init__.py index 63e75bea..7a948b39 100644 --- a/tap_hubspot/__init__.py +++ b/tap_hubspot/__init__.py @@ -582,7 +582,7 @@ def sync_companies(STATE, ctx): bookmark_key = "hs_lastmodifieddate" start = utils.strptime_to_utc(get_start(STATE, "companies", bookmark_key)) LOGGER.info("sync_companies from %s", start) - schema = load_schema("companies") + schema = catalog["schema"] singer.write_schema( "companies", schema, ["companyId"], [bookmark_key], catalog.get("stream_alias") ) From 51ca8b5efd373f7c7f3ed621260fcf112f641cc5 Mon Sep 17 00:00:00 2001 From: "Patrick-Ranjit D. Madsen" Date: Thu, 16 Jan 2020 10:40:33 +0100 Subject: [PATCH 24/78] remove unused variables --- tap_hubspot/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tap_hubspot/__init__.py b/tap_hubspot/__init__.py index 7a948b39..729e1de7 100644 --- a/tap_hubspot/__init__.py +++ b/tap_hubspot/__init__.py @@ -662,7 +662,6 @@ def sync_deals(STATE, ctx): start = utils.strptime_with_tz(get_start(STATE, "deals", bookmark_key)) max_bk_value = start LOGGER.info("sync_deals from %s", start) - most_recent_modified_time = start params = {"count": 250, "includeAssociations": False, "properties": []} schema = load_schema("deals") @@ -1249,7 +1248,7 @@ def load_discovered_schema(stream): mdata, (), "valid-replication-keys", [stream.replication_key] ) - for field_name, props in schema["properties"].items(): + for field_name in schema["properties"]: if field_name in stream.key_properties or field_name == stream.replication_key: mdata = metadata.write( mdata, ("properties", field_name), "inclusion", "automatic" From 0c68b3df0ee80ab79ec64775d6cf562159630810 Mon Sep 17 00:00:00 2001 From: "Patrick-Ranjit D. 
Madsen" Date: Thu, 16 Jan 2020 10:44:50 +0100 Subject: [PATCH 25/78] campaigns catalog schema --- tap_hubspot/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tap_hubspot/__init__.py b/tap_hubspot/__init__.py index 729e1de7..5315d5cd 100644 --- a/tap_hubspot/__init__.py +++ b/tap_hubspot/__init__.py @@ -727,7 +727,7 @@ def sync_deals(STATE, ctx): def sync_campaigns(STATE, ctx): catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE)) mdata = metadata.to_map(catalog.get("metadata")) - schema = load_schema("campaigns") + schema = catalog["schema"] singer.write_schema("campaigns", schema, ["id"], catalog.get("stream_alias")) LOGGER.info("sync_campaigns(NO bookmarks)") url = get_url("campaigns_all") From 107af7dc3dd6647ea10a5773e65b58a31a154705 Mon Sep 17 00:00:00 2001 From: "Patrick-Ranjit D. Madsen" Date: Thu, 16 Jan 2020 10:45:44 +0100 Subject: [PATCH 26/78] sync_entity_chunked schema --- tap_hubspot/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tap_hubspot/__init__.py b/tap_hubspot/__init__.py index 5315d5cd..cb820f70 100644 --- a/tap_hubspot/__init__.py +++ b/tap_hubspot/__init__.py @@ -758,7 +758,7 @@ def sync_campaigns(STATE, ctx): def sync_entity_chunked(STATE, catalog, entity_name, key_properties, path): - schema = load_schema(entity_name) + schema = catalog["schema"] bookmark_key = "startTimestamp" singer.write_schema( From 21799738be2b70352da0a503e69ebe76c2cd5172 Mon Sep 17 00:00:00 2001 From: "Patrick-Ranjit D. Madsen" Date: Thu, 16 Jan 2020 10:46:19 +0100 Subject: [PATCH 27/78] workflows schema from catalog --- tap_hubspot/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tap_hubspot/__init__.py b/tap_hubspot/__init__.py index cb820f70..e73e7e86 100644 --- a/tap_hubspot/__init__.py +++ b/tap_hubspot/__init__.py @@ -936,7 +936,7 @@ def sync_forms(STATE, ctx): def sync_workflows(STATE, ctx): catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE)) mdata = metadata.to_map(catalog.get("metadata")) - schema = load_schema("workflows") + schema = catalog["schema"] bookmark_key = "updatedAt" singer.write_schema( "workflows", schema, ["id"], [bookmark_key], catalog.get("stream_alias") From 86e85954da94e737d8f863d116b4a913852d9496 Mon Sep 17 00:00:00 2001 From: "Patrick-Ranjit D. Madsen" Date: Thu, 16 Jan 2020 10:46:36 +0100 Subject: [PATCH 28/78] owners schema change --- tap_hubspot/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tap_hubspot/__init__.py b/tap_hubspot/__init__.py index e73e7e86..3ba85802 100644 --- a/tap_hubspot/__init__.py +++ b/tap_hubspot/__init__.py @@ -974,7 +974,7 @@ def sync_workflows(STATE, ctx): def sync_owners(STATE, ctx): catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE)) mdata = metadata.to_map(catalog.get("metadata")) - schema = load_schema("owners") + schema = catalog["schema"] bookmark_key = "updatedAt" singer.write_schema( From 416c2d52480a19f383ce9e5b206285a5bb6b0df6 Mon Sep 17 00:00:00 2001 From: "Patrick-Ranjit D. Madsen" Date: Thu, 16 Jan 2020 10:46:55 +0100 Subject: [PATCH 29/78] engagements... 
schema --- tap_hubspot/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tap_hubspot/__init__.py b/tap_hubspot/__init__.py index 3ba85802..b6554e42 100644 --- a/tap_hubspot/__init__.py +++ b/tap_hubspot/__init__.py @@ -1015,7 +1015,7 @@ def sync_owners(STATE, ctx): def sync_engagements(STATE, ctx): catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE)) mdata = metadata.to_map(catalog.get("metadata")) - schema = load_schema("engagements") + schema = catalog["schema"] bookmark_key = "lastUpdated" singer.write_schema( "engagements", From b5561f350cb36243826cde4029c801363a15ae8b Mon Sep 17 00:00:00 2001 From: "Patrick-Ranjit D. Madsen" Date: Thu, 16 Jan 2020 10:47:10 +0100 Subject: [PATCH 30/78] deal pipeline --- tap_hubspot/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tap_hubspot/__init__.py b/tap_hubspot/__init__.py index b6554e42..581a243f 100644 --- a/tap_hubspot/__init__.py +++ b/tap_hubspot/__init__.py @@ -1088,7 +1088,7 @@ def sync_engagements(STATE, ctx): def sync_deal_pipelines(STATE, ctx): catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE)) mdata = metadata.to_map(catalog.get("metadata")) - schema = load_schema("deal_pipelines") + schema = catalog["schema"] singer.write_schema( "deal_pipelines", schema, ["pipelineId"], catalog.get("stream_alias") ) From a4cc5c277b09004fadfa40183d05629496fba909 Mon Sep 17 00:00:00 2001 From: JingLin0 Date: Tue, 18 Feb 2020 22:54:28 +0100 Subject: [PATCH 31/78] prettier commit --- tap_hubspot/schemas/contacts.json | 108 +++++++++++++++--------------- 1 file changed, 54 insertions(+), 54 deletions(-) diff --git a/tap_hubspot/schemas/contacts.json b/tap_hubspot/schemas/contacts.json index 1c41b8cf..f61a97b5 100644 --- a/tap_hubspot/schemas/contacts.json +++ b/tap_hubspot/schemas/contacts.json @@ -25,70 +25,70 @@ "profile-url": { "type": ["null", "string"] }, - "associated-company" : { - "type": ["null", "object"], - "properties" : {} + "associated-company": { + "type": ["null", "object"], + "properties": {} }, "identity-profiles": { - "type": ["null", "array"], - "items": { - "type": ["null", "object"], - "properties": { - "deleted-changed-timestamp": { - "type": ["null", "string"], - "format": "date-time" - }, - "saved-at-timestamp": { - "type": ["null", "string"], - "format": "date-time" + "type": ["null", "array"], + "items": { + "type": ["null", "object"], + "properties": { + "deleted-changed-timestamp": { + "type": ["null", "string"], + "format": "date-time" + }, + "saved-at-timestamp": { + "type": ["null", "string"], + "format": "date-time" + }, + "vid": { + "type": ["null", "integer"] + }, + "identities": { + "type": ["null", "array"], + "items": { + "type": ["null", "object"], + "properties": { + "timestamp": { + "type": ["null", "string"], + "format": "date-time" }, - "vid": { - "type": ["null", "integer"] + "type": { + "type": ["null", "string"] }, - "identities": { - "type": ["null", "array"], - "items": { - "type": ["null", "object"], - "properties": { - "timestamp": { - "type": ["null", "string"], - "format": "date-time" - }, - "type": { - "type": ["null", "string"] - }, - "value": { - "type": ["null", "string"] - } - } - } + "value": { + "type": ["null", "string"] } + } } + } } + } }, "list-memberships": { - "type": ["null", "array"], - "items": { - "type": ["null", "object"], - "properties": { - "internal-list-id": { - "type": ["null", "integer"] - }, - "is-member": { - "type": ["null", "boolean"] - }, - "static-list-id": { - "type": ["null", 
"integer"] - }, - "timestamp": { - "type": ["null", "string"], - "format": "date-time" - }, - "vid": { - "type": ["null", "integer"] - } - } + "type": ["null", "array"], + "items": { + "type": ["null", "object"], + "properties": { + "internal-list-id": { + "type": ["null", "integer"] + }, + "is-member": { + "type": ["null", "boolean"] + }, + "static-list-id": { + "type": ["null", "integer"] + }, + "timestamp": { + "type": ["null", "string"], + "format": "date-time" + }, + "vid": { + "type": ["null", "integer"] + } } + } }, "form-submissions": { "type": ["null", "array"], From 23cf922bf3e4f9bd936b4e85fbe649eea5510379 Mon Sep 17 00:00:00 2001 From: JingLin0 Date: Tue, 18 Feb 2020 23:49:11 +0100 Subject: [PATCH 32/78] only include fields that are needed --- tap_hubspot/schemas/companies.json | 118 ++++++++++++++- tap_hubspot/schemas/contacts.json | 210 ++++---------------------- tap_hubspot/schemas/deals.json | 97 ++++++++++++ tap_hubspot/schemas/email_events.json | 91 ----------- 4 files changed, 242 insertions(+), 274 deletions(-) diff --git a/tap_hubspot/schemas/companies.json b/tap_hubspot/schemas/companies.json index 286f249d..06982bee 100644 --- a/tap_hubspot/schemas/companies.json +++ b/tap_hubspot/schemas/companies.json @@ -1,11 +1,123 @@ { "type": "object", "properties": { - "portalId": { - "type": ["null", "integer"] - }, "companyId": { "type": ["null", "integer"] + }, + "properties": { + "type": "object", + "properties": { + "name": { + "type": "object", + "properties": { + "value": { + "type": ["null", "string"] + }, + "timestamp": { + "type": ["null", "string"], + "format": "date-time" + }, + "source": { + "type": ["null", "string"] + }, + "sourceId": { + "type": ["null", "string"] + } + } + }, + "country": { + "type": "object", + "properties": { + "value": { + "type": ["null", "string"] + }, + "timestamp": { + "type": ["null", "string"], + "format": "date-time" + }, + "source": { + "type": ["null", "string"] + }, + "sourceId": { + "type": ["null", "string"] + } + } + }, + "domain": { + "type": "object", + "properties": { + "value": { + "type": ["null", "string"] + }, + "timestamp": { + "type": ["null", "string"], + "format": "date-time" + }, + "source": { + "type": ["null", "string"] + }, + "sourceId": { + "type": ["null", "string"] + } + } + }, + "website": { + "type": "object", + "properties": { + "value": { + "type": ["null", "string"] + }, + "timestamp": { + "type": ["null", "string"], + "format": "date-time" + }, + "source": { + "type": ["null", "string"] + }, + "sourceId": { + "type": ["null", "string"] + } + } + }, + "createdate": { + "type": "object", + "properties": { + "value": { + "type": ["null", "string"], + "format": "date-time" + }, + "timestamp": { + "type": ["null", "string"], + "format": "date-time" + }, + "source": { + "type": ["null", "string"] + }, + "sourceId": { + "type": ["null", "string"] + } + } + }, + "hs_lastmodifieddate": { + "type": "object", + "properties": { + "value": { + "type": ["null", "string"], + "format": "date-time" + }, + "timestamp": { + "type": ["null", "string"], + "format": "date-time" + }, + "source": { + "type": ["null", "string"] + }, + "sourceId": { + "type": ["null", "string"] + } + } + } + } } } } diff --git a/tap_hubspot/schemas/contacts.json b/tap_hubspot/schemas/contacts.json index f61a97b5..54e4d3d2 100644 --- a/tap_hubspot/schemas/contacts.json +++ b/tap_hubspot/schemas/contacts.json @@ -4,190 +4,40 @@ "vid": { "type": ["null", "integer"] }, - "canonical-vid": { - "type": ["null", "integer"] - }, - "merged-vids": { - 
"type": ["null", "array"], - "items": { - "type": ["null", "integer"] - } - }, - "portal-id": { - "type": ["null", "integer"] - }, - "is-contact": { - "type": ["null", "boolean"] - }, - "profile-token": { - "type": ["null", "string"] - }, - "profile-url": { - "type": ["null", "string"] - }, - "associated-company": { - "type": ["null", "object"], - "properties": {} - }, - "identity-profiles": { - "type": ["null", "array"], - "items": { - "type": ["null", "object"], - "properties": { - "deleted-changed-timestamp": { - "type": ["null", "string"], - "format": "date-time" - }, - "saved-at-timestamp": { - "type": ["null", "string"], - "format": "date-time" - }, - "vid": { - "type": ["null", "integer"] - }, - "identities": { - "type": ["null", "array"], - "items": { - "type": ["null", "object"], - "properties": { - "timestamp": { - "type": ["null", "string"], - "format": "date-time" - }, - "type": { - "type": ["null", "string"] - }, - "value": { - "type": ["null", "string"] - } - } + "properties": { + "type": "object", + "properties": { + "email": { + "type": "object", + "properties": { + "value": { + "type": ["null", "string"] } } - } - } - }, - "list-memberships": { - "type": ["null", "array"], - "items": { - "type": ["null", "object"], - "properties": { - "internal-list-id": { - "type": ["null", "integer"] - }, - "is-member": { - "type": ["null", "boolean"] - }, - "static-list-id": { - "type": ["null", "integer"] - }, - "timestamp": { - "type": ["null", "string"], - "format": "date-time" - }, - "vid": { - "type": ["null", "integer"] - } - } - } - }, - "form-submissions": { - "type": ["null", "array"], - "items": { - "type": ["null", "object"], - "properties": { - "conversion-id": { - "type": ["null", "string"] - }, - "timestamp": { - "type": ["null", "string"], - "format": "date-time" - }, - "form-id": { - "type": ["null", "string"] - }, - "portal-id": { - "type": ["null", "integer"] - }, - "page-url": { - "type": ["null", "string"] - }, - "title": { - "type": ["null", "string"] + }, + "createdate": { + "type": "object", + "properties": { + "value": { + "type": ["null", "string"], + "format": "date-time" + } } - } - } - }, - "merge-audits": { - "type": ["null", "array"], - "items": { - "type": ["null", "object"], - "properties": { - "canonical-vid": { - "type": ["null", "integer"] - }, - "vid-to-merge": { - "type": ["null", "integer"] - }, - "timestamp": { - "type": ["null", "string"], - "format": "date-time" - }, - "user-id": { - "type": ["null", "integer"] - }, - "num-properties-moved": { - "type": ["null", "integer"] - }, - "merged_from_email": { - "type": ["null", "object"], - "properties": { - "value": { - "type": ["null", "string"] - }, - "source-type": { - "type": ["null", "string"] - }, - "source-id": { - "type": ["null", "string"] - }, - "source-label": { - "type": ["null", "string"] - }, - "source-vids": { - "type": ["null", "array"], - "items": { - "type": ["null", "integer"] - } - }, - "timestamp": { - "type": ["null", "integer"] - }, - "selected": { - "type": ["null", "boolean"] - } + }, + "lastmodifieddate": { + "type": "object", + "properties": { + "value": { + "type": ["null", "string"], + "format": "date-time" } - }, - "merged_to_email": { - "type": ["null", "object"], - "properties": { - "value": { - "type": ["null", "string"] - }, - "source-type": { - "type": ["null", "string"] - }, - "source-id": { - "type": ["null", "string"] - }, - "source-label": { - "type": ["null", "string"] - }, - "timestamp": { - "type": ["null", "integer"] - }, - "selected": { - "type": ["null", 
"boolean"] - } + } + }, + "associatedcompanyid": { + "type": "object", + "properties": { + "value": { + "type": ["null", "number"] } } } diff --git a/tap_hubspot/schemas/deals.json b/tap_hubspot/schemas/deals.json index 60d3cc9c..1696df9c 100644 --- a/tap_hubspot/schemas/deals.json +++ b/tap_hubspot/schemas/deals.json @@ -29,6 +29,103 @@ } } } + }, + "properties": { + "type": "object", + "properties": { + "closedate": { + "type": "object", + "properties": { + "value": { + "type": ["null", "string"], + "format": "date-time" + }, + "timestamp": { + "type": ["null", "string"], + "format": "date-time" + }, + "source": { + "type": ["null", "string"] + }, + "sourceId": { + "type": ["null", "string"] + } + } + }, + "createdate": { + "type": "object", + "properties": { + "value": { + "type": ["null", "string"], + "format": "date-time" + }, + "timestamp": { + "type": ["null", "string"], + "format": "date-time" + }, + "source": { + "type": ["null", "string"] + }, + "sourceId": { + "type": ["null", "string"] + } + } + }, + "dealtype": { + "type": "object", + "properties": { + "value": { + "type": ["null", "string"] + }, + "timestamp": { + "type": ["null", "string"], + "format": "date-time" + }, + "source": { + "type": ["null", "string"] + }, + "sourceId": { + "type": ["null", "string"] + } + } + }, + "amount_in_home_currency": { + "type": "object", + "properties": { + "value": { + "type": ["null", "number"] + }, + "timestamp": { + "type": ["null", "string"], + "format": "date-time" + }, + "source": { + "type": ["null", "string"] + }, + "sourceId": { + "type": ["null", "string"] + } + } + }, + "dealstage": { + "type": "object", + "properties": { + "value": { + "type": ["null", "string"] + }, + "timestamp": { + "type": ["null", "string"], + "format": "date-time" + }, + "source": { + "type": ["null", "string"] + }, + "sourceId": { + "type": ["null", "string"] + } + } + } + } } } } diff --git a/tap_hubspot/schemas/email_events.json b/tap_hubspot/schemas/email_events.json index e74aa07d..06da548c 100644 --- a/tap_hubspot/schemas/email_events.json +++ b/tap_hubspot/schemas/email_events.json @@ -1,110 +1,19 @@ { "type": "object", "properties": { - "appId": { - "type": ["null", "integer"] - }, - "appName": { - "type": ["null", "string"] - }, - "browser": { - "type": ["null", "object"], - "properties": { - "family": { - "type": ["null", "string"] - }, - "name": { - "type": ["null", "string"] - }, - "producer": { - "type": ["null", "string"] - }, - "producerUrl": { - "type": ["null", "string"] - }, - "type": { - "type": ["null", "string"] - }, - "url": { - "type": ["null", "string"] - } - } - }, "created": { "type": ["null", "string"], "format": "date-time" }, - "deviceType": { - "type": ["null", "string"] - }, - "duration": { - "type": ["null", "integer"] - }, - "emailCampaignId": { - "type": ["null", "integer"] - }, - "emailCampaignGroupId": { - "type": ["null", "integer"] - }, - "filteredEvent": { - "type": ["null", "boolean"] - }, - "from": { - "type": ["null", "string"] - }, - "hmid": { - "type": ["null", "string"] - }, "id": { "type": ["null", "string"] }, "ipAddress": { "type": ["null", "string"] }, - "linkId": { - "type": ["null", "integer"] - }, - "location": { - "type": ["null", "object"], - "properties": { - "city": { - "type": ["null", "string"] - }, - "country": { - "type": ["null", "string"] - }, - "state": { - "type": ["null", "string"] - } - } - }, - "portalId": { - "type": ["null", "integer"] - }, "recipient": { "type": ["null", "string"] }, - "response": { - "type": ["null", "string"] - }, - 
"sentBy": { - "type": ["null", "object"], - "properties": { - "created": { - "type": ["null", "string"], - "format": "date-time" - }, - "id": { - "type": ["null", "string"] - } - } - }, - "smtpId": { - "type": ["null", "string"] - }, - "subject": { - "type": ["null", "string"] - }, "type": { "type": ["null", "string"] }, From 69241e060dff0c5c19bf9d7a9af0cb38f6852893 Mon Sep 17 00:00:00 2001 From: JingLin0 Date: Wed, 19 Feb 2020 00:16:07 +0100 Subject: [PATCH 33/78] delete unused catalogs --- tap_hubspot/schemas/campaigns.json | 91 ----------------- tap_hubspot/schemas/contact_lists.json | 97 ------------------- tap_hubspot/schemas/contacts_by_company.json | 12 --- tap_hubspot/schemas/owners.json | 72 -------------- tap_hubspot/schemas/subscription_changes.json | 54 ----------- tap_hubspot/schemas/workflows.json | 48 --------- 6 files changed, 374 deletions(-) delete mode 100644 tap_hubspot/schemas/campaigns.json delete mode 100644 tap_hubspot/schemas/contact_lists.json delete mode 100644 tap_hubspot/schemas/contacts_by_company.json delete mode 100644 tap_hubspot/schemas/owners.json delete mode 100644 tap_hubspot/schemas/subscription_changes.json delete mode 100644 tap_hubspot/schemas/workflows.json diff --git a/tap_hubspot/schemas/campaigns.json b/tap_hubspot/schemas/campaigns.json deleted file mode 100644 index 29797daa..00000000 --- a/tap_hubspot/schemas/campaigns.json +++ /dev/null @@ -1,91 +0,0 @@ -{ - "type": "object", - "properties": { - "appId": { - "type": ["null", "integer"] - }, - "appName": { - "type": ["null", "string"] - }, - "contentId": { - "type": ["null", "integer"] - }, - "counters": { - "type": ["null", "object"], - "properties": { - "delievered": { - "type": ["null", "integer"] - }, - "open": { - "type": ["null", "integer"] - }, - "processed": { - "type": ["null", "integer"] - }, - "sent": { - "type": ["null", "integer"] - }, - "deferred": { - "type": ["null", "integer"] - }, - "unsubscribed": { - "type": ["null", "integer"] - }, - "statuschange": { - "type": ["null", "integer"] - }, - "bounce": { - "type": ["null", "integer"] - }, - "mta_dropped": { - "type": ["null", "integer"] - }, - "dropped": { - "type": ["null", "integer"] - }, - "suppressed": { - "type": ["null", "integer"] - }, - "click": { - "type": ["null", "integer"] - }, - "delivered": { - "type": ["null", "integer"] - }, - "forward": { - "type": ["null", "integer"] - }, - "print": { - "type": ["null", "integer"] - }, - "reply": { - "type": ["null", "integer"] - }, - "spamreport": { - "type": ["null", "integer"] - } - } - }, - "id": { - "type": ["null", "integer"] - }, - "name": { - "type": ["null", "string"] - }, - "numIncluded": { - "type": ["null", "integer"] - }, - "numQueued": { - "type": ["null", "integer"] - }, - "subType": { - "type": ["null", "string"] - }, - "subject": { - "type": ["null", "string"] - }, - "type": { - "type": ["null", "string"] - } - } -} diff --git a/tap_hubspot/schemas/contact_lists.json b/tap_hubspot/schemas/contact_lists.json deleted file mode 100644 index d3ad2ae6..00000000 --- a/tap_hubspot/schemas/contact_lists.json +++ /dev/null @@ -1,97 +0,0 @@ -{ - "type": "object", - "properties": { - "parentId": { - "type": ["null", "integer"] - }, - "metaData": { - "type": "object", - "properties": { - "processing": { - "type": ["null", "string"] - }, - "size": { - "type": ["null", "integer"] - }, - "error": { - "type": ["null", "string"] - }, - "lastProcessingStateChangeAt": { - "type": ["null", "string"], - "format": "date-time" - }, - "lastSizeChangeAt": { - "type": ["null", 
"string"], - "format": "date-time" - } - } - }, - "dynamic": { - "type": ["null", "boolean"] - }, - "name": { - "type": ["null", "string"] - }, - "filters": { - "type": "array", - "items": { - "type": "array", - "items": { - "type": "object", - "properties": { - "filterFamily": { - "type": ["null", "string"] - }, - "withinTimeMode": { - "type": ["null", "string"] - }, - "checkPastVersions": { - "type": ["null", "boolean"] - }, - "type": { - "type": ["null", "string"] - }, - "property": { - "type": ["null", "string"] - }, - "value": { - "type": ["null", "string"] - }, - "operator": { - "type": ["null", "string"] - } - } - } - } - }, - "portalId": { - "type": ["null", "integer"] - }, - "createdAt": { - "type": ["null", "string"], - "format": "date-time" - }, - "listId": { - "type": ["null", "integer"] - }, - "updatedAt": { - "type": ["null", "string"], - "format": "date-time" - }, - "internalListId": { - "type": ["null", "integer"] - }, - "readOnly": { - "type": ["null", "boolean"] - }, - "deleteable": { - "type": ["null", "boolean"] - }, - "listType": { - "type": ["null", "string"] - }, - "archived": { - "type": ["null", "boolean"] - } - } -} diff --git a/tap_hubspot/schemas/contacts_by_company.json b/tap_hubspot/schemas/contacts_by_company.json deleted file mode 100644 index 22e7ffbb..00000000 --- a/tap_hubspot/schemas/contacts_by_company.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "type": "object", - "properties": { - "contact_id": { - "type": ["integer"] - }, - "company_id": { - "type": ["integer"] - } - }, - "additionalProperties": false -} diff --git a/tap_hubspot/schemas/owners.json b/tap_hubspot/schemas/owners.json deleted file mode 100644 index 2e3d61d0..00000000 --- a/tap_hubspot/schemas/owners.json +++ /dev/null @@ -1,72 +0,0 @@ -{ - "type": "object", - "properties": { - "portalId": { - "type": ["null", "integer"] - }, - "ownerId": { - "type": ["null", "integer"] - }, - "type": { - "type": ["null", "string"] - }, - "firstName": { - "type": ["null", "string"] - }, - "lastName": { - "type": ["null", "string"] - }, - "email": { - "type": ["null", "string"] - }, - "createdAt": { - "type": ["null", "string"], - "format": "date-time" - }, - "signature": { - "type": ["null", "string"] - }, - "updatedAt": { - "type": ["null", "string"], - "format": "date-time" - }, - "hasContactsAccess" : { - "type": ["null", "boolean"] - }, - "isActive": { - "type": ["null", "boolean"] - }, - "activeUserId" : { - "type": ["null", "integer"] - }, - "userIdIncludingInactive" : { - "type": ["null", "integer"] - }, - "remoteList": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": ["null", "integer"] - }, - "portalId": { - "type": ["null", "integer"] - }, - "ownerId": { - "type": ["null", "integer"] - }, - "remoteId": { - "type": ["null", "string"] - }, - "remoteType": { - "type": ["null", "string"] - }, - "active": { - "type": ["null", "boolean"] - } - } - } - } - } -} diff --git a/tap_hubspot/schemas/subscription_changes.json b/tap_hubspot/schemas/subscription_changes.json deleted file mode 100644 index 1db687d2..00000000 --- a/tap_hubspot/schemas/subscription_changes.json +++ /dev/null @@ -1,54 +0,0 @@ -{ - "type": "object", - "properties": { - "timestamp": { - "type": ["null", "string"], - "format": "date-time" - }, - "portalId": { - "type": ["null", "integer"] - }, - "recipient": { - "type": ["null", "string"] - }, - "changes": { - "type": ["null", "array"], - "items": { - "type": ["null", "object"], - "properties": { - "change": { - "type": ["null", "string"] - }, 
- "timestamp": { - "type": ["null", "string"], - "format": "date-time" - }, - "source": { - "type": ["null", "string"] - }, - "portalId": { - "type": ["null", "integer"] - }, - "subscriptionId": { - "type": ["null", "integer"] - }, - "changeType": { - "type": ["null", "string"] - }, - "causedByEvent": { - "type": ["null", "object"], - "properties": { - "id": { - "type": ["null", "string"] - }, - "created": { - "type": ["null", "string"], - "format": "date-time" - } - } - } - } - } - } - } -} diff --git a/tap_hubspot/schemas/workflows.json b/tap_hubspot/schemas/workflows.json deleted file mode 100644 index a72491ae..00000000 --- a/tap_hubspot/schemas/workflows.json +++ /dev/null @@ -1,48 +0,0 @@ -{ - "type": "object", - "properties": { - "name": { - "type": ["null", "string"] - }, - "id": { - "type": ["null", "integer"] - }, - "type": { - "type": ["null", "string"] - }, - "enabled": { - "type": ["null", "boolean"] - }, - "insertedAt": { - "type": ["null", "string"], - "format": "date-time" - }, - "updatedAt": { - "type": ["null", "string"], - "format": "date-time" - }, - "personaTagIds": { - "type": "array", - "items": { - "type": "integer" - } - }, - "contactListIds": { - "type": "object", - "properties": { - "enrolled": { - "type": ["null", "integer"] - }, - "active": { - "type": ["null", "integer"] - }, - "steps": { - "type": ["null", "array"], - "items": { - "type": ["null", "string"] - } - } - } - } - } -} From 721b64b090dfc1a315155870311cf8c1a18f3a99 Mon Sep 17 00:00:00 2001 From: JingLin0 Date: Wed, 19 Feb 2020 00:52:27 +0100 Subject: [PATCH 34/78] delete custom schema update --- tap_hubspot/__init__.py | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/tap_hubspot/__init__.py b/tap_hubspot/__init__.py index 581a243f..9c5b9706 100644 --- a/tap_hubspot/__init__.py +++ b/tap_hubspot/__init__.py @@ -210,40 +210,12 @@ def parse_custom_schema(entity_name, data): } -def get_custom_schema(entity_name): - return parse_custom_schema( - entity_name, request(get_url(entity_name + "_properties")).json() - ) - - def get_abs_path(path): return os.path.join(os.path.dirname(os.path.realpath(__file__)), path) -def load_associated_company_schema(): - associated_company_schema = load_schema("companies") - # pylint: disable=line-too-long - associated_company_schema["properties"]["company-id"] = associated_company_schema[ - "properties" - ].pop("companyId") - associated_company_schema["properties"]["portal-id"] = associated_company_schema[ - "properties" - ].pop("portalId") - return associated_company_schema - - def load_schema(entity_name): schema = utils.load_json(get_abs_path("schemas/{}.json".format(entity_name))) - if entity_name in ["contacts", "companies", "deals"]: - custom_schema = get_custom_schema(entity_name) - schema["properties"]["properties"] = { - "type": "object", - "properties": custom_schema, - } - - if entity_name == "contacts": - schema["properties"]["associated-company"] = load_associated_company_schema() - return schema_nodash(schema) From 677810f7e94441147b55010e3e5f6523dbae3834 Mon Sep 17 00:00:00 2001 From: JingLin0 Date: Wed, 19 Feb 2020 01:03:35 +0100 Subject: [PATCH 35/78] delete unused functions --- tap_hubspot/__init__.py | 201 ---------------------------------------- 1 file changed, 201 deletions(-) diff --git a/tap_hubspot/__init__.py b/tap_hubspot/__init__.py index 9c5b9706..84982042 100644 --- a/tap_hubspot/__init__.py +++ b/tap_hubspot/__init__.py @@ -510,30 +510,6 @@ def use_recent_companies_endpoint(response): return response["total"] < 
10000 -default_contacts_by_company_params = {"count": 250} - -# NB> to do: support stream aliasing and field selection -def _sync_contacts_by_company(STATE, ctx, company_id): - catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE)) - schema = catalog.get["schema"] - mdata = metadata.to_map(catalog.get("metadata")) - url = get_url("contacts_by_company", company_id=company_id) - path = "vids" - with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee: - with metrics.record_counter(CONTACTS_BY_COMPANY) as counter: - data = request(url, default_contacts_by_company_params).json() - for row in data[path]: - counter.increment() - row = replace_na_with_none(row) - record = {"company_id": company_id, "contact_id": row} - record = bumble_bee.transform(record, schema, mdata) - singer.write_record( - "contacts_by_company", record, time_extracted=utils.now() - ) - - return STATE - - default_company_params = { "limit": 250, "properties": [ @@ -571,13 +547,6 @@ def sync_companies(STATE, ctx): url = get_url("companies_all") max_bk_value = start - if CONTACTS_BY_COMPANY in ctx.selected_stream_ids: - contacts_by_company_schema = load_schema(CONTACTS_BY_COMPANY) - singer.write_schema( - "contacts_by_company", - contacts_by_company_schema, - ["company_id", "contact_id"], - ) with bumble_bee: for row in gen_request( @@ -615,8 +584,6 @@ def sync_companies(STATE, ctx): catalog.get("stream_alias"), time_extracted=utils.now(), ) - if CONTACTS_BY_COMPANY in ctx.selected_stream_ids: - STATE = _sync_contacts_by_company(STATE, ctx, record["companyId"]) # Don't bookmark past the start of this sync to account for updated records during the sync. new_bookmark = min(max_bk_value, current_sync_start) STATE = singer.write_bookmark( @@ -803,70 +770,12 @@ def sync_entity_chunked(STATE, catalog, entity_name, key_properties, path): return STATE -def sync_subscription_changes(STATE, ctx): - catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE)) - STATE = sync_entity_chunked( - STATE, - catalog, - "subscription_changes", - ["timestamp", "portalId", "recipient"], - "timeline", - ) - return STATE - - def sync_email_events(STATE, ctx): catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE)) STATE = sync_entity_chunked(STATE, catalog, "email_events", ["id"], "events") return STATE -def sync_contact_lists(STATE, ctx): - catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE)) - mdata = metadata.to_map(catalog.get("metadata")) - schema = load_schema("contact_lists") - bookmark_key = "updatedAt" - singer.write_schema( - "contact_lists", schema, ["listId"], [bookmark_key], catalog.get("stream_alias") - ) - - start = get_start(STATE, "contact_lists", bookmark_key) - max_bk_value = start - - LOGGER.info("sync_contact_lists from %s", start) - - url = get_url("contact_lists") - params = {"count": 250} - with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee: - for row in gen_request( - STATE, - "contact_lists", - url, - params, - "lists", - "has-more", - ["offset"], - ["offset"], - ): - row = replace_na_with_none(row) - record = bumble_bee.transform(row, schema, mdata) - - if record[bookmark_key] >= start: - singer.write_record( - "contact_lists", - record, - catalog.get("stream_alias"), - time_extracted=utils.now(), - ) - if record[bookmark_key] >= max_bk_value: - max_bk_value = record[bookmark_key] - - STATE = singer.write_bookmark(STATE, "contact_lists", bookmark_key, max_bk_value) - singer.write_state(STATE) - - return STATE - - def 
sync_forms(STATE, ctx): catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE)) mdata = metadata.to_map(catalog.get("metadata")) @@ -905,85 +814,6 @@ def sync_forms(STATE, ctx): return STATE -def sync_workflows(STATE, ctx): - catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE)) - mdata = metadata.to_map(catalog.get("metadata")) - schema = catalog["schema"] - bookmark_key = "updatedAt" - singer.write_schema( - "workflows", schema, ["id"], [bookmark_key], catalog.get("stream_alias") - ) - start = get_start(STATE, "workflows", bookmark_key) - max_bk_value = start - - STATE = singer.write_bookmark(STATE, "workflows", bookmark_key, max_bk_value) - singer.write_state(STATE) - - LOGGER.info("sync_workflows from %s", start) - - data = request(get_url("workflows")).json() - time_extracted = utils.now() - - with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee: - for row in data["workflows"]: - row = replace_na_with_none(row) - record = bumble_bee.transform(row, schema, mdata) - if record[bookmark_key] >= start: - singer.write_record( - "workflows", - record, - catalog.get("stream_alias"), - time_extracted=time_extracted, - ) - if record[bookmark_key] >= max_bk_value: - max_bk_value = record[bookmark_key] - - STATE = singer.write_bookmark(STATE, "workflows", bookmark_key, max_bk_value) - singer.write_state(STATE) - return STATE - - -def sync_owners(STATE, ctx): - catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE)) - mdata = metadata.to_map(catalog.get("metadata")) - schema = catalog["schema"] - bookmark_key = "updatedAt" - - singer.write_schema( - "owners", schema, ["ownerId"], [bookmark_key], catalog.get("stream_alias") - ) - start = get_start(STATE, "owners", bookmark_key) - max_bk_value = start - - LOGGER.info("sync_owners from %s", start) - - params = {} - if CONFIG.get("include_inactives"): - params["includeInactives"] = "true" - data = request(get_url("owners"), params).json() - - time_extracted = utils.now() - - with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee: - for row in data: - row = replace_na_with_none(row) - record = bumble_bee.transform(row, schema, mdata) - if record[bookmark_key] >= max_bk_value: - max_bk_value = record[bookmark_key] - - if record[bookmark_key] >= start: - singer.write_record( - "owners", - record, - catalog.get("stream_alias"), - time_extracted=time_extracted, - ) - - STATE = singer.write_bookmark(STATE, "owners", bookmark_key, max_bk_value) - singer.write_state(STATE) - return STATE - - def sync_engagements(STATE, ctx): catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE)) mdata = metadata.to_map(catalog.get("metadata")) @@ -1090,21 +920,9 @@ class Stream(object): STREAMS = [ - # Do these first as they are incremental - Stream( - "subscription_changes", - sync_subscription_changes, - ["timestamp", "portalId", "recipient"], - "startTimestamp", - "INCREMENTAL", - ), Stream("email_events", sync_email_events, ["id"], "startTimestamp", "INCREMENTAL"), # Do these last as they are full table Stream("forms", sync_forms, ["guid"], "updatedAt", "FULL_TABLE"), - Stream("workflows", sync_workflows, ["id"], "updatedAt", "FULL_TABLE"), - Stream("owners", sync_owners, ["ownerId"], "updatedAt", "FULL_TABLE"), - Stream("campaigns", sync_campaigns, ["id"], None, "FULL_TABLE"), - Stream("contact_lists", sync_contact_lists, ["listId"], "updatedAt", "FULL_TABLE"), Stream("contacts", sync_contacts, ["vid"], "versionTimestamp", "FULL_TABLE"), Stream( "companies", 
sync_companies, ["companyId"], "hs_lastmodifieddate", "FULL_TABLE" @@ -1252,25 +1070,6 @@ def discover_schemas(): "metadata": mdata, } ) - # Load the contacts_by_company schema - LOGGER.info("Loading schema for contacts_by_company") - contacts_by_company = Stream( - "contacts_by_company", - _sync_contacts_by_company, - ["company-id", "contact-id"], - None, - "FULL_TABLE", - ) - schema, mdata = load_discovered_schema(contacts_by_company) - - result["streams"].append( - { - "stream": CONTACTS_BY_COMPANY, - "tap_stream_id": CONTACTS_BY_COMPANY, - "schema": schema, - "metadata": mdata, - } - ) return result From 6a75d2596e7db3fa3b9d9333156cb730972bb9a9 Mon Sep 17 00:00:00 2001 From: JingLin0 Date: Wed, 19 Feb 2020 09:54:56 +0100 Subject: [PATCH 36/78] add missed field --- tap_hubspot/schemas/deals.json | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tap_hubspot/schemas/deals.json b/tap_hubspot/schemas/deals.json index 1696df9c..597eeb4f 100644 --- a/tap_hubspot/schemas/deals.json +++ b/tap_hubspot/schemas/deals.json @@ -33,6 +33,24 @@ "properties": { "type": "object", "properties": { + "pipeline": { + "type": "object", + "properties": { + "value": { + "type": ["null", "string"] + }, + "timestamp": { + "type": ["null", "string"], + "format": "date-time" + }, + "source": { + "type": ["null", "string"] + }, + "sourceId": { + "type": ["null", "string"] + } + } + }, "closedate": { "type": "object", "properties": { From 87ccbe14c80d4f6c1c6ecc38a16147b782195a90 Mon Sep 17 00:00:00 2001 From: JingLin0 Date: Fri, 28 Feb 2020 12:30:12 +0100 Subject: [PATCH 37/78] do not show get request --- tap_hubspot/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tap_hubspot/__init__.py b/tap_hubspot/__init__.py index 84982042..125448ac 100644 --- a/tap_hubspot/__init__.py +++ b/tap_hubspot/__init__.py @@ -346,7 +346,7 @@ def request(url, params=None): headers["User-Agent"] = CONFIG["user_agent"] req = requests.Request("GET", url, params=params, headers=headers).prepare() - LOGGER.info("GET %s", req.url) + #LOGGER.info("GET %s", req.url) with metrics.http_request_timer(parse_source_from_url(url)) as timer: resp = SESSION.send(req) timer.tags[metrics.Tag.http_status_code] = resp.status_code From 332b59c157cc5bbf4e8b5aec8cfe3a088c75a700 Mon Sep 17 00:00:00 2001 From: JingLin0 Date: Sun, 15 Mar 2020 18:39:58 +0100 Subject: [PATCH 38/78] add hs_lastmodifieddate field --- tap_hubspot/schemas/deals.json | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tap_hubspot/schemas/deals.json b/tap_hubspot/schemas/deals.json index 597eeb4f..cd8e9be6 100644 --- a/tap_hubspot/schemas/deals.json +++ b/tap_hubspot/schemas/deals.json @@ -51,6 +51,25 @@ } } }, + "hs_lastmodifieddate": { + "type": "object", + "properties": { + "value": { + "type": ["null", "string"], + "format": "date-time" + }, + "timestamp": { + "type": ["null", "string"], + "format": "date-time" + }, + "source": { + "type": ["null", "string"] + }, + "sourceId": { + "type": ["null", "string"] + } + } + }, "closedate": { "type": "object", "properties": { From 24dc42b5f0874f1b27896abb956ec2a5e76566eb Mon Sep 17 00:00:00 2001 From: JingLin0 Date: Sun, 15 Mar 2020 23:09:53 +0100 Subject: [PATCH 39/78] sync table companies incrementally --- tap_hubspot/__init__.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tap_hubspot/__init__.py b/tap_hubspot/__init__.py index 125448ac..8a72d4f2 100644 --- a/tap_hubspot/__init__.py +++ b/tap_hubspot/__init__.py @@ -576,14 +576,14 
@@ def sync_companies(STATE, ctx): if modified_time and modified_time >= max_bk_value: max_bk_value = modified_time - - record = bumble_bee.transform(row, schema, mdata) - singer.write_record( - "companies", - record, - catalog.get("stream_alias"), - time_extracted=utils.now(), - ) + if not modified_time or modified_time >= start: + record = bumble_bee.transform(row, schema, mdata) + singer.write_record( + "companies", + record, + catalog.get("stream_alias"), + time_extracted=utils.now(), + ) # Don't bookmark past the start of this sync to account for updated records during the sync. new_bookmark = min(max_bk_value, current_sync_start) STATE = singer.write_bookmark( From edc57423e736a148fd6afd2cd0c9668a76bec9b0 Mon Sep 17 00:00:00 2001 From: JingLin0 Date: Mon, 16 Mar 2020 11:12:34 +0100 Subject: [PATCH 40/78] update deal-pipeline schema --- tap_hubspot/schemas/deal_pipelines.json | 36 ++++++++++++++++++------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/tap_hubspot/schemas/deal_pipelines.json b/tap_hubspot/schemas/deal_pipelines.json index e23a6445..423d42a0 100644 --- a/tap_hubspot/schemas/deal_pipelines.json +++ b/tap_hubspot/schemas/deal_pipelines.json @@ -4,6 +4,20 @@ "pipelineId": { "type": ["null", "string"] }, + "createdAt": { + "type": ["null", "string"], + "format": "date-time" + }, + "updatedAt": { + "type": ["null", "string"], + "format": "date-time" + }, + "objectType": { + "type": ["null", "string"] + }, + "objectTypeId": { + "type": ["null", "string"] + }, "stages": { "type": ["null", "array"], "items": { @@ -15,20 +29,22 @@ "label": { "type": ["null", "string"] }, - "probability": { - "type": ["null", "number"] - }, - "active": { - "type": ["null", "boolean"] - }, "displayOrder": { "type": ["null", "integer"] }, - "closedWon": { - "type": ["null", "boolean"] + "metadata": { + "type": "object", + "properties": { + "isClosed": { + "type": "string" + }, + "probability": { + "type": "string" + } + } } } - } + } }, "label": { "type": ["null", "string"] @@ -40,7 +56,7 @@ "type": ["null", "integer"] }, "staticDefault": { - "type": ["null", "boolean"] + "type": ["null", "boolean"] } } } From 2969fccb6a8d6ec31674f17b01f212b10eb050bb Mon Sep 17 00:00:00 2001 From: JingLin0 Date: Mon, 16 Mar 2020 11:13:24 +0100 Subject: [PATCH 41/78] /deals/v1/pipelines endpoint is deprecated --- tap_hubspot/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tap_hubspot/__init__.py b/tap_hubspot/__init__.py index 8a72d4f2..bb51f2a2 100644 --- a/tap_hubspot/__init__.py +++ b/tap_hubspot/__init__.py @@ -82,7 +82,7 @@ class StateFields: "deals_all": "/deals/v1/deal/paged", "deals_recent": "/deals/v1/deal/recent/modified", "deals_detail": "/deals/v1/deal/{deal_id}", - "deal_pipelines": "/deals/v1/pipelines", + "deal_pipelines": "/crm-pipelines/v1/pipelines/deals", "campaigns_all": "/email/public/v1/campaigns/by-id", "campaigns_detail": "/email/public/v1/campaigns/{campaign_id}", "engagements_all": "/engagements/v1/engagements/paged", From 2fd365762a44c7317d2d2f95a6dec8195df73823 Mon Sep 17 00:00:00 2001 From: JingLin0 Date: Mon, 16 Mar 2020 11:14:59 +0100 Subject: [PATCH 42/78] sync deal-pipeline incrementally with new replication key --- tap_hubspot/__init__.py | 40 +++++++++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/tap_hubspot/__init__.py b/tap_hubspot/__init__.py index bb51f2a2..42bb4eea 100644 --- a/tap_hubspot/__init__.py +++ b/tap_hubspot/__init__.py @@ -888,24 +888,44 @@ def sync_engagements(STATE, 
ctx): def sync_deal_pipelines(STATE, ctx): + bookmark_key = "updatedAt" + start = utils.strptime_with_tz(get_start(STATE, "deal_pipelines", bookmark_key)) + max_bk_value = start catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE)) mdata = metadata.to_map(catalog.get("metadata")) schema = catalog["schema"] singer.write_schema( "deal_pipelines", schema, ["pipelineId"], catalog.get("stream_alias") ) - LOGGER.info("sync_deal_pipelines") + LOGGER.info(f"sync deal_pipelines from {start}") + data = request(get_url("deal_pipelines")).json() with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee: - for row in data: + for row in data["results"]: + modified_time = None row = replace_na_with_none(row) - record = bumble_bee.transform(row, schema, mdata) - singer.write_record( - "deal_pipelines", - record, - catalog.get("stream_alias"), - time_extracted=utils.now(), + if bookmark_key in row: + timestamp_millis = row[bookmark_key] + elif "createdAt" in row: + # Hubspot returns timestamps in millis + timestamp_millis = row["createdAt"] + modified_time = datetime.datetime.fromtimestamp( + timestamp_millis / 1000.0, datetime.timezone.utc ) + if modified_time and modified_time >= max_bk_value: + max_bk_value = modified_time + + if not modified_time or modified_time > start: + record = bumble_bee.transform(row, schema, mdata) + singer.write_record( + "deal_pipelines", + record, + catalog.get("stream_alias"), + time_extracted=utils.now(), + ) + STATE = singer.write_bookmark( + STATE, "deal_pipelines", bookmark_key, utils.strftime(max_bk_value) + ) singer.write_state(STATE) return STATE @@ -928,7 +948,9 @@ class Stream(object): "companies", sync_companies, ["companyId"], "hs_lastmodifieddate", "FULL_TABLE" ), Stream("deals", sync_deals, ["dealId"], "hs_lastmodifieddate", "FULL_TABLE"), - Stream("deal_pipelines", sync_deal_pipelines, ["pipelineId"], None, "FULL_TABLE"), + Stream( + "deal_pipelines", sync_deal_pipelines, ["pipelineId"], "updatedAt", "FULL_TABLE" + ), Stream( "engagements", sync_engagements, ["engagement_id"], "lastUpdated", "FULL_TABLE" ), From b96846552a324f093f7c4937c89d341cd387da51 Mon Sep 17 00:00:00 2001 From: JingLin0 Date: Mon, 16 Mar 2020 11:15:17 +0100 Subject: [PATCH 43/78] avoid repetition of data --- tap_hubspot/__init__.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tap_hubspot/__init__.py b/tap_hubspot/__init__.py index 42bb4eea..392e637c 100644 --- a/tap_hubspot/__init__.py +++ b/tap_hubspot/__init__.py @@ -481,7 +481,7 @@ def sync_contacts(STATE, ctx): ) ) - if not modified_time or modified_time >= start: + if not modified_time or modified_time > start: vids.append(row["vid"]) if modified_time and modified_time >= max_bk_value: @@ -576,7 +576,7 @@ def sync_companies(STATE, ctx): if modified_time and modified_time >= max_bk_value: max_bk_value = modified_time - if not modified_time or modified_time >= start: + if not modified_time or modified_time > start: record = bumble_bee.transform(row, schema, mdata) singer.write_record( "companies", @@ -646,7 +646,7 @@ def sync_deals(STATE, ctx): if modified_time and modified_time >= max_bk_value: max_bk_value = modified_time - if not modified_time or modified_time >= start: + if not modified_time or modified_time > start: record = bumble_bee.transform(row, schema, mdata) singer.write_record( "deals", @@ -798,7 +798,7 @@ def sync_forms(STATE, ctx): row = replace_na_with_none(row) record = bumble_bee.transform(row, schema, mdata) - if record[bookmark_key] >= start: + if 
record[bookmark_key] > start: singer.write_record( "forms", record, @@ -864,7 +864,7 @@ def sync_engagements(STATE, ctx): for engagement in engagements: engagement = replace_na_with_none(engagement) record = bumble_bee.transform(engagement, schema, mdata) - if record["engagement"][bookmark_key] >= start: + if record["engagement"][bookmark_key] > start: # hoist PK and bookmark field to top-level record record["engagement_id"] = record["engagement"]["id"] record[bookmark_key] = record["engagement"][bookmark_key] From 3b3fb85861863ed1efb8326b2f340fdcbd94b1d9 Mon Sep 17 00:00:00 2001 From: JingLin0 Date: Tue, 14 Apr 2020 00:45:45 +0200 Subject: [PATCH 44/78] delete test files --- tap_hubspot/tests/__init__.py | 0 tap_hubspot/tests/test_bookmarks.py | 68 ----------------- tap_hubspot/tests/test_deals.py | 47 ------------ tap_hubspot/tests/test_get_streams_to_sync.py | 50 ------------- tap_hubspot/tests/test_offsets.py | 61 --------------- tap_hubspot/tests/utils.py | 74 ------------------- 6 files changed, 300 deletions(-) delete mode 100644 tap_hubspot/tests/__init__.py delete mode 100644 tap_hubspot/tests/test_bookmarks.py delete mode 100644 tap_hubspot/tests/test_deals.py delete mode 100644 tap_hubspot/tests/test_get_streams_to_sync.py delete mode 100644 tap_hubspot/tests/test_offsets.py delete mode 100644 tap_hubspot/tests/utils.py diff --git a/tap_hubspot/tests/__init__.py b/tap_hubspot/tests/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tap_hubspot/tests/test_bookmarks.py b/tap_hubspot/tests/test_bookmarks.py deleted file mode 100644 index 387a0616..00000000 --- a/tap_hubspot/tests/test_bookmarks.py +++ /dev/null @@ -1,68 +0,0 @@ -import unittest -import singer.bookmarks -import singer.messages -import tap_hubspot -import pprint -import os -from tap_hubspot.tests import utils - -LOGGER = singer.get_logger() - -class Bookmarks(unittest.TestCase): - def setUp(self): - utils.verify_environment_vars() - utils.seed_tap_hubspot_config() - singer.write_bookmark = utils.our_write_bookmark - singer.write_state = utils.our_write_state - singer.write_record = utils.our_write_record - singer.write_schema = utils.our_write_schema - - #NB> test account must have > 2 contacts for this to work - def sync_contacts(self): - STATE = utils.get_clear_state() - catalog = {'stream_alias' : 'hubspot_contacts'} - - tap_hubspot.default_contact_params['count'] = 1 - - STATE = tap_hubspot.sync_contacts(STATE, catalog) - #offset has been cleared - self.assertEqual(utils.caught_state['bookmarks']['contacts']['offset'], {}) - - #some bookmark has been recorded in the state - self.assertNotEqual(utils.caught_state['bookmarks']['contacts']['lastmodifieddate'], None) - - #should sync some contacts - # LOGGER.info('A caught record: {}'.format(utils.caught_records['contacts'][0])) - self.assertGreater(len(utils.caught_records['contacts']),1) - self.assertEqual(set(utils.caught_records.keys()), {'contacts'}) - self.assertEqual(utils.caught_pks, {'contacts': ['vid']}) - - utils.caught_records = [] - STATE = tap_hubspot.sync_contacts(STATE, catalog) - - #no new records thanks to bookmark - self.assertEqual(len(utils.caught_records),0) - - def sync_companies(self): - STATE = utils.get_clear_state() - - catalog = {'stream_alias' : 'hubspot_companies'} - STATE = tap_hubspot.sync_companies(STATE, catalog) - - #offset has been cleared - self.assertEqual(utils.caught_state['bookmarks']['companies']['offset'], {}) - - #some bookmark has been recorded in the state - 
self.assertNotEqual(utils.caught_state['bookmarks']['companies']['hs_lastmodifieddate'], None) - - #should sync some contacts && some hubspot_contacts_by_company - self.assertGreater(len(utils.caught_records), 0) - self.assertEqual(set(utils.caught_records.keys()), {'companies', 'hubspot_contacts_by_company'}) - - self.assertEqual(utils.caught_pks, {'companies': ['companyId'], 'hubspot_contacts_by_company': ['company-id', 'contact-id']}) - - utils.caught_records = [] - STATE = tap_hubspot.sync_companies(STATE, catalog) - - #no new records thanks to bookmark - self.assertEqual(len(utils.caught_records),0) diff --git a/tap_hubspot/tests/test_deals.py b/tap_hubspot/tests/test_deals.py deleted file mode 100644 index 7dacf1c9..00000000 --- a/tap_hubspot/tests/test_deals.py +++ /dev/null @@ -1,47 +0,0 @@ -from tap_hubspot import sync_deals -import unittest -from unittest.mock import patch, ANY - - -class TestDealsToSync(unittest.TestCase): - - @patch('tap_hubspot.Context.get_catalog_from_id', return_value={"metadata":""}) - @patch('singer.metadata.to_map', return_value={}) - @patch('singer.utils.strptime_with_tz') - @patch('singer.utils.strftime') - @patch('tap_hubspot.load_schema') - @patch('tap_hubspot.gen_request', return_value=list()) - def test_associations_are_not_validated(self, - mocked_gen_request, - mocked_catalog_from_id, - mocked_metadata_map, - mocked_utils_strptime, - mocked_utils_strftime, - mocked_load_schema): - - sync_deals({}, mocked_catalog_from_id) - - expected_param = {'count': 250, 'includeAssociations': False, 'properties': []} - - mocked_gen_request.assert_called_once_with(ANY, ANY, ANY, expected_param, ANY, ANY, ANY, ANY) - - - @patch('tap_hubspot.Context.get_catalog_from_id', return_value={"metadata":""}) - @patch('singer.metadata.to_map', return_value={"associations" :{"selected" : True}}) - @patch('singer.utils.strptime_with_tz') - @patch('singer.utils.strftime') - @patch('tap_hubspot.load_schema') - @patch('tap_hubspot.gen_request', return_value=list()) - def test_associations_are_validated(self, - mocked_gen_request, - mocked_catalog_from_id, - mocked_metadata_map, - mocked_utils_strptime, - mocked_utils_strftime, - mocked_load_schema): - - sync_deals({}, mocked_catalog_from_id) - - expected_param = {'count': 250, 'includeAssociations': True, 'properties': []} - - mocked_gen_request.assert_called_once_with(ANY, ANY, ANY, expected_param, ANY, ANY, ANY, ANY) diff --git a/tap_hubspot/tests/test_get_streams_to_sync.py b/tap_hubspot/tests/test_get_streams_to_sync.py deleted file mode 100644 index 92c70c8f..00000000 --- a/tap_hubspot/tests/test_get_streams_to_sync.py +++ /dev/null @@ -1,50 +0,0 @@ -from contextlib import contextmanager -from io import StringIO -from singer import utils -from tap_hubspot import * -import time -import datetime -import json -import requests_mock -import unittest - -class TestGetStreamsToSync(unittest.TestCase): - - def setUp(self): - self.streams = [ - Stream('a', 'a', [], None, None), - Stream('b', 'b', [], None, None), - Stream('c', 'c', [], None, None), - ] - - def test_get_streams_to_sync_with_no_this_stream(self): - state = {'this_stream': None} - self.assertEqual(self.streams, get_streams_to_sync(self.streams, state)) - - def test_get_streams_to_sync_with_first_stream(self): - state = {'currently_syncing': 'a'} - - result = get_streams_to_sync(self.streams, state) - - parsed_result = [s.tap_stream_id for s in result] - self.assertEqual(parsed_result, ['a', 'b', 'c']) - - def test_get_streams_to_sync_with_middle_stream(self): - state = 
{'currently_syncing': 'b'} - - result = get_streams_to_sync(self.streams, state) - - parsed_result = [s.tap_stream_id for s in result] - self.assertEqual(parsed_result, ['b', 'c', 'a']) - - def test_get_streams_to_sync_with_last_stream(self): - state = {'currently_syncing': 'c'} - - result = get_streams_to_sync(self.streams, state) - - parsed_result = [s.tap_stream_id for s in result] - self.assertEqual(parsed_result, ['c', 'a', 'b']) - - def test_parse_source_from_url_succeeds(self): - url = "https://api.hubapi.com/companies/v2/companies/recent/modified" - self.assertEqual('companies', parse_source_from_url(url)) diff --git a/tap_hubspot/tests/test_offsets.py b/tap_hubspot/tests/test_offsets.py deleted file mode 100644 index 22208ff3..00000000 --- a/tap_hubspot/tests/test_offsets.py +++ /dev/null @@ -1,61 +0,0 @@ -import unittest -import logging -import singer -import tap_hubspot -import singer.bookmarks -from tap_hubspot.tests import utils - -LOGGER = singer.get_logger() - -def set_offset_with_exception(state, tap_stream_id, offset_key, offset_value): - LOGGER.info("set_offset_with_exception: {}".format(utils.caught_state)) - utils.caught_state = singer.bookmarks.set_offset(state, tap_stream_id, offset_key, offset_value) - raise Exception("simulated") - -class Offsets(unittest.TestCase): - def setUp(self): - utils.verify_environment_vars() - utils.seed_tap_hubspot_config() - singer.write_bookmark = utils.our_write_bookmark - singer.write_state = utils.our_write_state - singer.write_record = utils.our_write_record - singer.write_schema = utils.our_write_schema - singer.set_offset = set_offset_with_exception - - #NB> test accounts must have > 1 companies for this to work - def sync_companies(self): - simulated_exception = None - STATE = utils.get_clear_state() - catalog = {'stream_alias' : 'hubspot_companies'} - - #change count = 1 - tap_hubspot.default_company_params['limit'] = 1 - - try: - STATE = tap_hubspot.sync_companies(STATE, catalog) - except Exception as ex: - simulated_exception = ex - # logging.exception('strange') - - self.assertIsNot(simulated_exception, None) - - - self.assertEqual(set(utils.caught_records.keys()), {'companies', 'hubspot_contacts_by_company'}) - - #should only emit 1 company record because of the limit - self.assertEqual(len(utils.caught_records['companies']), 1) - self.assertGreater(len(utils.caught_records['hubspot_contacts_by_company']), 0) - - #offset should be set in state - LOGGER.info("utils.caught_state: {}".format(utils.caught_state)) - self.assertNotEqual(utils.caught_state['bookmarks']['companies']['offset'], {}) - - #no bookmark though - self.assertEqual(utils.caught_state['bookmarks']['companies']['hs_lastmodifieddate'], None) - - #change count back to 250 - tap_hubspot.default_company_params['limit'] = 250 - - #call do_sync and verify: - # 1)sync_companies is called first - # 2)previous retrieved record is not retrieved again diff --git a/tap_hubspot/tests/utils.py b/tap_hubspot/tests/utils.py deleted file mode 100644 index 1fe8a3c6..00000000 --- a/tap_hubspot/tests/utils.py +++ /dev/null @@ -1,74 +0,0 @@ -import singer -import singer.bookmarks -import os -import tap_hubspot - -LOGGER = singer.get_logger() - -caught_records = {} -caught_bookmarks = [] -caught_state = {} -caught_schema = {} -caught_pks = {} - - -def verify_environment_vars(): - missing_envs = [x for x in [os.getenv('TAP_HUBSPOT_REDIRECT_URI'), - os.getenv('TAP_HUBSPOT_CLIENT_ID'), - os.getenv('TAP_HUBSPOT_CLIENT_SECRET'), - os.getenv('TAP_HUBSPOT_REFRESH_TOKEN')] if x == None] - 
if len(missing_envs) != 0: - #pylint: disable=line-too-long - raise Exception("set TAP_HUBSPOT_REDIRECT_URI, TAP_HUBSPOT_CLIENT_ID, TAP_HUBSPOT_CLIENT_SECRET, TAP_HUBSPOT_REFRESH_TOKEN") - -def seed_tap_hubspot_config(): - tap_hubspot.CONFIG = { - "access_token": None, - "token_expires": None, - - "redirect_uri": os.environ['TAP_HUBSPOT_REDIRECT_URI'], - "client_id": os.environ['TAP_HUBSPOT_CLIENT_ID'], - "client_secret": os.environ['TAP_HUBSPOT_CLIENT_SECRET'], - "refresh_token": os.environ['TAP_HUBSPOT_REFRESH_TOKEN'], - "start_date": "2001-01-01T00:00:00Z" - } - -def get_clear_state(): - return { - "bookmarks": { - "contacts": { - "offset": {}, - "lastmodifieddate": None - }, - "companies": { - "offset": {}, - "hs_lastmodifieddate": None - } - - }, - "currently_syncing": None - } - - -#pylint: disable=line-too-long -def our_write_bookmark(state, table_name, bookmark_key, bookmark_value): - caught_bookmarks.append([bookmark_key, bookmark_value]) - state = singer.bookmarks.write_bookmark(state, table_name, bookmark_key, bookmark_value) - return state - -def our_write_schema(table_name, schema, pks, stream_alias=None): - global caught_pks - caught_pks[table_name] = pks - caught_schema[table_name] = schema - -def our_write_state(state): - LOGGER.info("our_write_state: {}".format(state)) - global caught_state - caught_state = state - return state - -def our_write_record(table_name, record, stream_alias=None): - if caught_records.get(table_name) == None: - caught_records[table_name] = [] - - caught_records[table_name].append(record) From 5eb3bbcb36554b8d44f078c636988c4b9e0ca6d2 Mon Sep 17 00:00:00 2001 From: JingLin0 Date: Tue, 14 Apr 2020 00:49:13 +0200 Subject: [PATCH 45/78] include major cli functions --- tap_hubspot/__init__.py | 1180 +++------------------------------------ 1 file changed, 80 insertions(+), 1100 deletions(-) diff --git a/tap_hubspot/__init__.py b/tap_hubspot/__init__.py index 392e637c..ffb4a6ba 100644 --- a/tap_hubspot/__init__.py +++ b/tap_hubspot/__init__.py @@ -1,1131 +1,111 @@ #!/usr/bin/env python3 -import datetime -import pytz -import itertools import os -import re import sys import json - -import attr -import backoff -import requests import singer -import singer.messages -import singer.metrics as metrics -from singer import metadata -from singer import utils -from singer import ( - transform, - UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING, - Transformer, - _transform_datetime, -) - -LOGGER = singer.get_logger() -SESSION = requests.Session() - - -class InvalidAuthException(Exception): - pass - - -class SourceUnavailableException(Exception): - pass - - -class DependencyException(Exception): - pass - - -class DataFields: - offset = "offset" - - -class StateFields: - offset = "offset" - this_stream = "this_stream" - - -BASE_URL = "https://api.hubapi.com" - -CONTACTS_BY_COMPANY = "contacts_by_company" - -DEFAULT_CHUNK_SIZE = 1000 * 60 * 60 * 24 - -CONFIG = { - "access_token": None, - "token_expires": None, - "email_chunk_size": DEFAULT_CHUNK_SIZE, - "subscription_chunk_size": DEFAULT_CHUNK_SIZE, - # in config.json - "redirect_uri": None, - "client_id": None, - "client_secret": None, - "refresh_token": None, - "start_date": None, - "hapikey": None, - "include_inactives": None, +from singer import utils, metadata, Catalog, CatalogEntry, Schema +from tap_hubspot.stream import Stream + +KEY_PROPERTIES = "id" +STREAMS = { + "email_events": { + "valid_replication_keys": ["startTimestamp"], + "key_properties": "id", + }, + "forms": {"valid_replication_keys": ["updatedAt"], 
"key_properties": "guid",}, + "contacts": { + "valid_replication_keys": ["versionTimestamp"], + "key_properties": "vid", + }, + "companies": { + "valid_replication_keys": ["hs_lastmodifieddate"], + "key_properties": "companyId", + }, + "deals": { + "valid_replication_keys": ["hs_lastmodifieddate"], + "key_properties": "dealId", + }, + "deal_pipelines": { + "valid_replication_keys": ["updatedAt"], + "key_properties": "pipelineId", + }, + "engagements": { + "valid_replication_keys": ["lastUpdated"], + "key_properties": "engagement_id", + }, } - -ENDPOINTS = { - "contacts_properties": "/properties/v1/contacts/properties", - "contacts_all": "/contacts/v1/lists/all/contacts/all", - "contacts_recent": "/contacts/v1/lists/recently_updated/contacts/recent", - "contacts_detail": "/contacts/v1/contact/vids/batch/", - "companies_properties": "/companies/v2/properties", - "companies_all": "/companies/v2/companies/paged", - "companies_recent": "/companies/v2/companies/recent/modified", - "companies_detail": "/companies/v2/companies/{company_id}", - "contacts_by_company": "/companies/v2/companies/{company_id}/vids", - "deals_properties": "/properties/v1/deals/properties", - "deals_all": "/deals/v1/deal/paged", - "deals_recent": "/deals/v1/deal/recent/modified", - "deals_detail": "/deals/v1/deal/{deal_id}", - "deal_pipelines": "/crm-pipelines/v1/pipelines/deals", - "campaigns_all": "/email/public/v1/campaigns/by-id", - "campaigns_detail": "/email/public/v1/campaigns/{campaign_id}", - "engagements_all": "/engagements/v1/engagements/paged", - "subscription_changes": "/email/public/v1/subscriptions/timeline", - "email_events": "/email/public/v1/events", - "contact_lists": "/contacts/v1/lists", - "forms": "/forms/v2/forms", - "workflows": "/automation/v3/workflows", - "owners": "/owners/v2/owners", -} - - -def get_start(state, tap_stream_id, bookmark_key): - current_bookmark = singer.get_bookmark(state, tap_stream_id, bookmark_key) - if current_bookmark is None: - return CONFIG["start_date"] - return current_bookmark - - -def get_current_sync_start(state, tap_stream_id): - current_sync_start_value = singer.get_bookmark( - state, tap_stream_id, "current_sync_start" - ) - if current_sync_start_value is None: - return current_sync_start_value - return utils.strptime_to_utc(current_sync_start_value) - - -def write_current_sync_start(state, tap_stream_id, start): - value = start - if start is not None: - value = utils.strftime(start) - return singer.write_bookmark(state, tap_stream_id, "current_sync_start", value) - - -def clean_state(state): - """ Clear deprecated keys out of state. """ - for stream, bookmark_map in state.get("bookmarks", {}).items(): - if "last_sync_duration" in bookmark_map: - LOGGER.info("{} - Removing last_sync_duration from state.".format(stream)) - state["bookmarks"][stream].pop("last_sync_duration", None) - - -def get_url(endpoint, **kwargs): - if endpoint not in ENDPOINTS: - raise ValueError("Invalid endpoint {}".format(endpoint)) - - return BASE_URL + ENDPOINTS[endpoint].format(**kwargs) - - -def replace_na_with_none(obj): - """Given a certain object, the function will replace any 'N/A' values with None. 
- E.g: object = { - "key1" : [{"subkey1": "value1"}, {"subkey2": "N/A"}], - "key2" : "n/a", - "key3" : { - "subkey3" : "n/a", - "subkey4" : "value2" - } - } - self.replace_na_with_none(object) will return: - { - "key1" : [{"subkey1": "value1"}, {"subkey2": None}], - "key2" : None, - "key3" : { - "subkey3" : None, - "subkey4" : "value2" - } - } - """ - if isinstance(obj, dict): - new_dict = {} - for key, value in obj.items(): - new_dict[key] = replace_na_with_none(value) - return new_dict - - if isinstance(obj, list): - new_list = [] - for value in obj: - new_list.append(replace_na_with_none(value)) - return new_list - - if isinstance(obj, str): - if obj.lower() == "n/a": - obj = None - return obj - - -def get_field_type_schema(field_type): - if field_type == "bool": - return {"type": ["null", "boolean"]} - - elif field_type == "datetime": - return {"type": ["null", "string"], "format": "date-time"} - - elif field_type == "number": - return {"type": ["null", "number"]} - - else: - return {"type": ["null", "string"]} - - -def get_field_schema(field_type, extras=False): - if extras: - return { - "type": "object", - "properties": { - "value": get_field_type_schema(field_type), - "timestamp": get_field_type_schema("datetime"), - "source": get_field_type_schema("string"), - "sourceId": get_field_type_schema("string"), - }, - } - else: - return { - "type": "object", - "properties": {"value": get_field_type_schema(field_type)}, - } - - -def parse_custom_schema(entity_name, data): - return { - field["name"]: get_field_schema(field["type"], entity_name != "contacts") - for field in data - } +REQUIRED_CONFIG_KEYS = [ + "start_date", + "client_id", + "client_secret", + "refresh_token", + "redirect_uri", +] +LOGGER = singer.get_logger() def get_abs_path(path): return os.path.join(os.path.dirname(os.path.realpath(__file__)), path) -def load_schema(entity_name): - schema = utils.load_json(get_abs_path("schemas/{}.json".format(entity_name))) - return schema_nodash(schema) - - -def schema_nodash(obj): - type_field = obj.get("type") - type = get_type(type_field) - if not type: - return obj - if not type in ["array", "object"]: - return obj - if "object" == type: - props = obj.get("properties", {}) - new_props = replace_props(props) - obj["properties"] = new_props - if "array" == type: - items = obj.get("items", {}) - obj["items"] = schema_nodash(items) - return obj - - -def get_type(type_field): - if isinstance(type_field, str): - return type_field - if isinstance(type_field, list): - types = set(type_field) - if "null" in types: - types.remove("null") - return types.pop() - return None - - -def replace_props(props): - if not props: - return props - keys = list(props.keys()) - for k in keys: - if not "-" in k: - props[k] = schema_nodash(props[k]) - else: - v = props.pop(k) - new_key = k.replace("-", "_") - new_value = schema_nodash(v) - props[new_key] = new_value - return props - - -# pylint: disable=invalid-name -def acquire_access_token_from_refresh_token(): - payload = { - "grant_type": "refresh_token", - "redirect_uri": CONFIG["redirect_uri"], - "refresh_token": CONFIG["refresh_token"], - "client_id": CONFIG["client_id"], - "client_secret": CONFIG["client_secret"], - } - - resp = requests.post(BASE_URL + "/oauth/v1/token", data=payload) - if resp.status_code == 403: - raise InvalidAuthException(resp.content) - - resp.raise_for_status() - auth = resp.json() - CONFIG["access_token"] = auth["access_token"] - CONFIG["refresh_token"] = auth["refresh_token"] - CONFIG["token_expires"] = 
datetime.datetime.utcnow() + datetime.timedelta( - seconds=auth["expires_in"] - 600 - ) - LOGGER.info("Token refreshed. Expires at %s", CONFIG["token_expires"]) - - -def giveup(exc): - return ( - exc.response is not None - and 400 <= exc.response.status_code < 500 - and exc.response.status_code != 429 - ) - - -def on_giveup(details): - if len(details["args"]) == 2: - url, params = details["args"] - else: - url = details["args"] - params = {} - - raise Exception( - "Giving up on request after {} tries with url {} and params {}".format( - details["tries"], url, params - ) - ) - - -URL_SOURCE_RE = re.compile(BASE_URL + r"/(\w+)/") - - -def parse_source_from_url(url): - match = URL_SOURCE_RE.match(url) - if match: - return match.group(1) - return None - - -@backoff.on_exception( - backoff.constant, - (requests.exceptions.RequestException, requests.exceptions.HTTPError), - max_tries=5, - jitter=None, - giveup=giveup, - on_giveup=on_giveup, - interval=10, -) -def request(url, params=None): - - params = params or {} - hapikey = CONFIG["hapikey"] - if hapikey is None: - if ( - CONFIG["token_expires"] is None - or CONFIG["token_expires"] < datetime.datetime.utcnow() - ): - acquire_access_token_from_refresh_token() - headers = {"Authorization": "Bearer {}".format(CONFIG["access_token"])} - else: - params["hapikey"] = hapikey - headers = {} - - if "user_agent" in CONFIG: - headers["User-Agent"] = CONFIG["user_agent"] - - req = requests.Request("GET", url, params=params, headers=headers).prepare() - #LOGGER.info("GET %s", req.url) - with metrics.http_request_timer(parse_source_from_url(url)) as timer: - resp = SESSION.send(req) - timer.tags[metrics.Tag.http_status_code] = resp.status_code - if resp.status_code == 403: - raise SourceUnavailableException(resp.content) - else: - resp.raise_for_status() - - return resp - - -# {"bookmarks" : {"contacts" : { "lastmodifieddate" : "2001-01-01" -# "offset" : {"vidOffset": 1234 -# "timeOffset": "3434434 }} -# "users" : { "timestamp" : "2001-01-01"}} -# "currently_syncing" : "contacts" -# } -# } - -# pylint: disable=line-too-long -def gen_request( - STATE, tap_stream_id, url, params, path, more_key, offset_keys, offset_targets -): - if len(offset_keys) != len(offset_targets): - raise ValueError("Number of offset_keys must match number of offset_targets") - - if singer.get_offset(STATE, tap_stream_id): - params.update(singer.get_offset(STATE, tap_stream_id)) - - with metrics.record_counter(tap_stream_id) as counter: - while True: - data = request(url, params).json() - - for row in data[path]: - counter.increment() - yield row - - if not data.get(more_key, False): - break - - STATE = singer.clear_offset(STATE, tap_stream_id) - for key, target in zip(offset_keys, offset_targets): - if key in data: - params[target] = data[key] - STATE = singer.set_offset(STATE, tap_stream_id, target, data[key]) - - singer.write_state(STATE) - - STATE = singer.clear_offset(STATE, tap_stream_id) - singer.write_state(STATE) - - -def _sync_contact_vids(catalog, vids, schema, bumble_bee): - if len(vids) == 0: - return - - data = request( - get_url("contacts_detail"), - params={"vid": vids, "showListMemberships": True, "formSubmissionMode": "all"}, - ).json() - time_extracted = utils.now() - mdata = metadata.to_map(catalog.get("metadata")) - - for record in data.values(): - record = replace_na_with_none(record) - record = bumble_bee.transform(record, schema, mdata) - record = record_nodash(record) - singer.write_record( - "contacts", - record, - catalog.get("stream_alias"), - 
time_extracted=time_extracted, - ) - - -def record_nodash(obj): - if not isinstance(obj, dict): # stopplesing criteria - return obj - - for k in obj.keys(): - value = record_nodash(obj[k]) - if not "-" in k: - key = k - else: - obj.pop(k) - key = k.replace("-", "_") - - obj[key] = value # recursion - - return obj - - -default_contact_params = { - "showListMemberships": True, - "includeVersion": True, - "count": 100, -} - - -def sync_contacts(STATE, ctx): - catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE)) - bookmark_key = "versionTimestamp" - start = utils.strptime_with_tz(get_start(STATE, "contacts", bookmark_key)) - LOGGER.info("sync_contacts from %s", start) - - max_bk_value = start - schema = catalog["schema"] - - singer.write_schema( - "contacts", schema, ["vid"], [bookmark_key], catalog.get("stream_alias") - ) - - url = get_url("contacts_all") - - vids = [] - with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee: - for row in gen_request( - STATE, - "contacts", - url, - default_contact_params, - "contacts", - "has-more", - ["vid-offset"], - ["vidOffset"], - ): - modified_time = None - if bookmark_key in row: - modified_time = utils.strptime_with_tz( - _transform_datetime( # pylint: disable=protected-access - row[bookmark_key], UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING - ) - ) - - if not modified_time or modified_time > start: - vids.append(row["vid"]) +def load_schemas(): + schemas = {} - if modified_time and modified_time >= max_bk_value: - max_bk_value = modified_time + for filename in os.listdir(get_abs_path("schemas")): + path = get_abs_path("schemas") + "/" + filename + file_raw = filename.replace(".json", "") + with open(path) as file: + schemas[file_raw] = json.load(file) - if len(vids) == 100: - _sync_contact_vids(catalog, vids, schema, bumble_bee) - vids = [] + return schemas - _sync_contact_vids(catalog, vids, schema, bumble_bee) - STATE = singer.write_bookmark( - STATE, "contacts", bookmark_key, utils.strftime(max_bk_value) - ) - singer.write_state(STATE) - return STATE +def discover() -> Catalog: + schemas = load_schemas() + streams = [] - -class ValidationPredFailed(Exception): - pass - - -# companies_recent only supports 10,000 results. If there are more than this, -# we'll need to use the companies_all endpoint -def use_recent_companies_endpoint(response): - return response["total"] < 10000 - - -default_company_params = { - "limit": 250, - "properties": [ - "website", - "name", - "country", - "domain", - "createdate", - "hs_lastmodifieddate", - ], -} - - -def sync_companies(STATE, ctx): - catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE)) - mdata = metadata.to_map(catalog.get("metadata")) - bumble_bee = Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) - bookmark_key = "hs_lastmodifieddate" - start = utils.strptime_to_utc(get_start(STATE, "companies", bookmark_key)) - LOGGER.info("sync_companies from %s", start) - schema = catalog["schema"] - singer.write_schema( - "companies", schema, ["companyId"], [bookmark_key], catalog.get("stream_alias") - ) - - # Because this stream doesn't query by `lastUpdated`, it cycles - # through the data set every time. The issue with this is that there - # is a race condition by which records may be updated between the - # start of this table's sync and the end, causing some updates to not - # be captured, in order to combat this, we must store the current - # sync's start in the state and not move the bookmark past this value. 
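# A minimal sketch of the bookmark clamping described in the comment above, with
# hypothetical timestamps (using the singer.utils helpers already imported in this
# module): the bookmark written at the end of the sync is capped at the moment the
# sync began, so records updated while the full-table scan was running are picked
# up again on the next run.
#
#   current_sync_start = utils.strptime_to_utc("2020-03-15T00:00:00Z")  # when this sync began
#   max_bk_value       = utils.strptime_to_utc("2020-03-15T04:30:00Z")  # newest record seen
#   new_bookmark = min(max_bk_value, current_sync_start)
#   # -> 2020-03-15T00:00:00+00:00, not the newer value seen mid-sync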
- current_sync_start = get_current_sync_start(STATE, "companies") or utils.now() - STATE = write_current_sync_start(STATE, "companies", current_sync_start) - singer.write_state(STATE) - - url = get_url("companies_all") - max_bk_value = start - - with bumble_bee: - for row in gen_request( - STATE, - "companies", - url, - default_company_params, - "companies", - "has-more", - ["offset"], - ["offset"], - ): - row_properties = row["properties"] - modified_time = None - if bookmark_key in row_properties: - # Hubspot returns timestamps in millis - timestamp_millis = row_properties[bookmark_key]["timestamp"] / 1000.0 - modified_time = datetime.datetime.fromtimestamp( - timestamp_millis, datetime.timezone.utc - ) - elif "createdate" in row_properties: - # Hubspot returns timestamps in millis - timestamp_millis = row_properties["createdate"]["timestamp"] / 1000.0 - modified_time = datetime.datetime.fromtimestamp( - timestamp_millis, datetime.timezone.utc - ) - - if modified_time and modified_time >= max_bk_value: - max_bk_value = modified_time - if not modified_time or modified_time > start: - record = bumble_bee.transform(row, schema, mdata) - singer.write_record( - "companies", - record, - catalog.get("stream_alias"), - time_extracted=utils.now(), - ) - # Don't bookmark past the start of this sync to account for updated records during the sync. - new_bookmark = min(max_bk_value, current_sync_start) - STATE = singer.write_bookmark( - STATE, "companies", bookmark_key, utils.strftime(new_bookmark) - ) - STATE = write_current_sync_start(STATE, "companies", None) - singer.write_state(STATE) - return STATE - - -def sync_deals(STATE, ctx): - catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE)) - mdata = metadata.to_map(catalog.get("metadata")) - bookmark_key = "hs_lastmodifieddate" - start = utils.strptime_with_tz(get_start(STATE, "deals", bookmark_key)) - max_bk_value = start - LOGGER.info("sync_deals from %s", start) - params = {"count": 250, "includeAssociations": False, "properties": []} - - schema = load_schema("deals") - singer.write_schema( - "deals", schema, ["dealId"], [bookmark_key], catalog.get("stream_alias") - ) - - # Check if we should include associations - for key in mdata.keys(): - if "associations" in key: - assoc_mdata = mdata.get(key) - if assoc_mdata.get("selected") and assoc_mdata.get("selected") == True: - params["includeAssociations"] = True - - # Append all the properties fields for deals to the request if - # properties is selectedOB - if mdata.get(("properties", "properties"), {}).get("selected"): - additional_properties = ( - schema.get("properties").get("properties").get("properties") + for tap_stream_id, props in STREAMS.items(): + schema = schemas[tap_stream_id] + mdata = metadata.get_standard_metadata( + schema=schema, + key_properties=props.get("key_properties", None), + valid_replication_keys=props.get("valid_replication_keys", []), ) - for key in additional_properties.keys(): - params["properties"].append(key) - - url = get_url("deals_all") - with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee: - for row in gen_request( - STATE, "deals", url, params, "deals", "hasMore", ["offset"], ["offset"] - ): - row_properties = row["properties"] - modified_time = None - if bookmark_key in row_properties: - # Hubspot returns timestamps in millis - timestamp_millis = row_properties[bookmark_key]["timestamp"] / 1000.0 - modified_time = datetime.datetime.fromtimestamp( - timestamp_millis, datetime.timezone.utc - ) - elif "createdate" in 
row_properties: - # Hubspot returns timestamps in millis - timestamp_millis = row_properties["createdate"]["timestamp"] / 1000.0 - modified_time = datetime.datetime.fromtimestamp( - timestamp_millis, datetime.timezone.utc - ) - if modified_time and modified_time >= max_bk_value: - max_bk_value = modified_time - - if not modified_time or modified_time > start: - record = bumble_bee.transform(row, schema, mdata) - singer.write_record( - "deals", - record, - catalog.get("stream_alias"), - time_extracted=utils.now(), - ) - - STATE = singer.write_bookmark( - STATE, "deals", bookmark_key, utils.strftime(max_bk_value) - ) - singer.write_state(STATE) - return STATE - - -# NB> no suitable bookmark is available: https://developers.hubspot.com/docs/methods/email/get_campaigns_by_id -def sync_campaigns(STATE, ctx): - catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE)) - mdata = metadata.to_map(catalog.get("metadata")) - schema = catalog["schema"] - singer.write_schema("campaigns", schema, ["id"], catalog.get("stream_alias")) - LOGGER.info("sync_campaigns(NO bookmarks)") - url = get_url("campaigns_all") - params = {"limit": 500} - - with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee: - for row in gen_request( - STATE, - "campaigns", - url, - params, - "campaigns", - "hasMore", - ["offset"], - ["offset"], - ): - record = request(get_url("campaigns_detail", campaign_id=row["id"])).json() - record = replace_na_with_none(record) - record = bumble_bee.transform(record, schema, mdata) - singer.write_record( - "campaigns", - record, - catalog.get("stream_alias"), - time_extracted=utils.now(), + streams.append( + CatalogEntry( + stream=tap_stream_id, + tap_stream_id=tap_stream_id, + key_properties=KEY_PROPERTIES, + schema=Schema.from_dict(schema), + metadata=mdata, ) - - return STATE - - -def sync_entity_chunked(STATE, catalog, entity_name, key_properties, path): - schema = catalog["schema"] - bookmark_key = "startTimestamp" - - singer.write_schema( - entity_name, schema, key_properties, [bookmark_key], catalog.get("stream_alias") - ) - - start = get_start(STATE, entity_name, bookmark_key) - LOGGER.info("sync_%s from %s", entity_name, start) - - now = datetime.datetime.utcnow().replace(tzinfo=pytz.UTC) - now_ts = int(now.timestamp() * 1000) - - start_ts = int(utils.strptime_with_tz(start).timestamp() * 1000) - url = get_url(entity_name) - - mdata = metadata.to_map(catalog.get("metadata")) - - if entity_name == "email_events": - window_size = int(CONFIG["email_chunk_size"]) - elif entity_name == "subscription_changes": - window_size = int(CONFIG["subscription_chunk_size"]) - - with metrics.record_counter(entity_name) as counter: - while start_ts < now_ts: - end_ts = start_ts + window_size - params = {"startTimestamp": start_ts, "endTimestamp": end_ts, "limit": 1000} - with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee: - while True: - our_offset = singer.get_offset(STATE, entity_name) - if bool(our_offset) and our_offset.get("offset") != None: - params[StateFields.offset] = our_offset.get("offset") - - data = request(url, params).json() - time_extracted = utils.now() - - for row in data[path]: - counter.increment() - row = replace_na_with_none(row) - record = bumble_bee.transform(row, schema, mdata) - singer.write_record( - entity_name, - record, - catalog.get("stream_alias"), - time_extracted=time_extracted, - ) - if data.get("hasMore"): - STATE = singer.set_offset( - STATE, entity_name, "offset", data["offset"] - ) - singer.write_state(STATE) 
- else: - STATE = singer.clear_offset(STATE, entity_name) - singer.write_state(STATE) - break - STATE = singer.write_bookmark( - STATE, - entity_name, - "startTimestamp", - utils.strftime( - datetime.datetime.fromtimestamp( - (start_ts / 1000), datetime.timezone.utc - ) - ), - ) # pylint: disable=line-too-long - singer.write_state(STATE) - start_ts = end_ts - - STATE = singer.clear_offset(STATE, entity_name) - singer.write_state(STATE) - return STATE - - -def sync_email_events(STATE, ctx): - catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE)) - STATE = sync_entity_chunked(STATE, catalog, "email_events", ["id"], "events") - return STATE - - -def sync_forms(STATE, ctx): - catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE)) - mdata = metadata.to_map(catalog.get("metadata")) - schema = load_schema("forms") - bookmark_key = "updatedAt" - - singer.write_schema( - "forms", schema, ["guid"], [bookmark_key], catalog.get("stream_alias") - ) - start = get_start(STATE, "forms", bookmark_key) - max_bk_value = start - - LOGGER.info("sync_forms from %s", start) - - data = request(get_url("forms")).json() - time_extracted = utils.now() - - with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee: - for row in data: - row = replace_na_with_none(row) - record = bumble_bee.transform(row, schema, mdata) - - if record[bookmark_key] > start: - singer.write_record( - "forms", - record, - catalog.get("stream_alias"), - time_extracted=time_extracted, - ) - if record[bookmark_key] >= max_bk_value: - max_bk_value = record[bookmark_key] - - STATE = singer.write_bookmark(STATE, "forms", bookmark_key, max_bk_value) - singer.write_state(STATE) - - return STATE - - -def sync_engagements(STATE, ctx): - catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE)) - mdata = metadata.to_map(catalog.get("metadata")) - schema = catalog["schema"] - bookmark_key = "lastUpdated" - singer.write_schema( - "engagements", - schema, - ["engagement_id"], - [bookmark_key], - catalog.get("stream_alias"), - ) - start = get_start(STATE, "engagements", bookmark_key) - - # Because this stream doesn't query by `lastUpdated`, it cycles - # through the data set every time. The issue with this is that there - # is a race condition by which records may be updated between the - # start of this table's sync and the end, causing some updates to not - # be captured, in order to combat this, we must store the current - # sync's start in the state and not move the bookmark past this value. 
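# A minimal sketch, with hypothetical values, of the record shape this function
# emits further down: the primary key and bookmark live inside the nested
# "engagement" object and are hoisted to the top level before the record is written.
#
#   record = {"engagement": {"id": 42, "lastUpdated": "2020-03-15T04:30:00Z"}}
#   record["engagement_id"] = record["engagement"]["id"]            # -> 42
#   record["lastUpdated"]   = record["engagement"]["lastUpdated"]   # bookmark field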
- current_sync_start = get_current_sync_start(STATE, "engagements") or utils.now() - STATE = write_current_sync_start(STATE, "engagements", current_sync_start) - singer.write_state(STATE) - - max_bk_value = start - LOGGER.info("sync_engagements from %s", start) - - STATE = singer.write_bookmark(STATE, "engagements", bookmark_key, start) - singer.write_state(STATE) - - url = get_url("engagements_all") - params = {"limit": 250} - top_level_key = "results" - engagements = gen_request( - STATE, - "engagements", - url, - params, - top_level_key, - "hasMore", - ["offset"], - ["offset"], - ) - - time_extracted = utils.now() - - with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee: - for engagement in engagements: - engagement = replace_na_with_none(engagement) - record = bumble_bee.transform(engagement, schema, mdata) - if record["engagement"][bookmark_key] > start: - # hoist PK and bookmark field to top-level record - record["engagement_id"] = record["engagement"]["id"] - record[bookmark_key] = record["engagement"][bookmark_key] - singer.write_record( - "engagements", - record, - catalog.get("stream_alias"), - time_extracted=time_extracted, - ) - if record["engagement"][bookmark_key] >= max_bk_value: - max_bk_value = record["engagement"][bookmark_key] - - # Don't bookmark past the start of this sync to account for updated records during the sync. - new_bookmark = min(utils.strptime_to_utc(max_bk_value), current_sync_start) - STATE = singer.write_bookmark( - STATE, "engagements", bookmark_key, utils.strftime(new_bookmark) - ) - STATE = write_current_sync_start(STATE, "engagements", None) - singer.write_state(STATE) - return STATE - - -def sync_deal_pipelines(STATE, ctx): - bookmark_key = "updatedAt" - start = utils.strptime_with_tz(get_start(STATE, "deal_pipelines", bookmark_key)) - max_bk_value = start - catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE)) - mdata = metadata.to_map(catalog.get("metadata")) - schema = catalog["schema"] - singer.write_schema( - "deal_pipelines", schema, ["pipelineId"], catalog.get("stream_alias") - ) - LOGGER.info(f"sync deal_pipelines from {start}") - - data = request(get_url("deal_pipelines")).json() - with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee: - for row in data["results"]: - modified_time = None - row = replace_na_with_none(row) - if bookmark_key in row: - timestamp_millis = row[bookmark_key] - elif "createdAt" in row: - # Hubspot returns timestamps in millis - timestamp_millis = row["createdAt"] - modified_time = datetime.datetime.fromtimestamp( - timestamp_millis / 1000.0, datetime.timezone.utc - ) - if modified_time and modified_time >= max_bk_value: - max_bk_value = modified_time - - if not modified_time or modified_time > start: - record = bumble_bee.transform(row, schema, mdata) - singer.write_record( - "deal_pipelines", - record, - catalog.get("stream_alias"), - time_extracted=utils.now(), - ) - STATE = singer.write_bookmark( - STATE, "deal_pipelines", bookmark_key, utils.strftime(max_bk_value) - ) - singer.write_state(STATE) - return STATE - - -@attr.s -class Stream(object): - tap_stream_id = attr.ib() - sync = attr.ib() - key_properties = attr.ib() - replication_key = attr.ib() - replication_method = attr.ib() - - -STREAMS = [ - Stream("email_events", sync_email_events, ["id"], "startTimestamp", "INCREMENTAL"), - # Do these last as they are full table - Stream("forms", sync_forms, ["guid"], "updatedAt", "FULL_TABLE"), - Stream("contacts", sync_contacts, ["vid"], 
"versionTimestamp", "FULL_TABLE"), - Stream( - "companies", sync_companies, ["companyId"], "hs_lastmodifieddate", "FULL_TABLE" - ), - Stream("deals", sync_deals, ["dealId"], "hs_lastmodifieddate", "FULL_TABLE"), - Stream( - "deal_pipelines", sync_deal_pipelines, ["pipelineId"], "updatedAt", "FULL_TABLE" - ), - Stream( - "engagements", sync_engagements, ["engagement_id"], "lastUpdated", "FULL_TABLE" - ), -] - - -def get_streams_to_sync(streams, state): - target_stream = singer.get_currently_syncing(state) - result = streams - if target_stream: - skipped = list( - itertools.takewhile(lambda x: x.tap_stream_id != target_stream, streams) - ) - rest = list( - itertools.dropwhile(lambda x: x.tap_stream_id != target_stream, streams) - ) - result = rest + skipped # Move skipped streams to end - if not result: - raise Exception("Unknown stream {} in state".format(target_stream)) - return result - - -def get_selected_streams(remaining_streams, ctx): - selected_streams = [] - for stream in remaining_streams: - if stream.tap_stream_id in ctx.selected_stream_ids: - selected_streams.append(stream) - return selected_streams - - -def do_sync(STATE, catalog): - # Clear out keys that are no longer used - clean_state(STATE) - - ctx = Context(catalog) - validate_dependencies(ctx) - - remaining_streams = get_streams_to_sync(STREAMS, STATE) - selected_streams = get_selected_streams(remaining_streams, ctx) - LOGGER.info( - "Starting sync. Will sync these streams: %s", - [stream.tap_stream_id for stream in selected_streams], - ) - for stream in selected_streams: - LOGGER.info("Syncing %s", stream.tap_stream_id) - STATE = singer.set_currently_syncing(STATE, stream.tap_stream_id) - singer.write_state(STATE) - - try: - STATE = stream.sync(STATE, ctx) # pylint: disable=not-callable - except SourceUnavailableException as ex: - error_message = str(ex).replace(CONFIG["access_token"], 10 * "*") - LOGGER.error(error_message) - pass - - STATE = singer.set_currently_syncing(STATE, None) - singer.write_state(STATE) - LOGGER.info("Sync completed") - - -class Context(object): - def __init__(self, catalog): - self.selected_stream_ids = set() - - for stream in catalog.get("streams"): - mdata = metadata.to_map(stream["metadata"]) - if metadata.get(mdata, (), "selected"): - self.selected_stream_ids.add(stream["tap_stream_id"]) - - self.catalog = catalog - - def get_catalog_from_id(self, tap_stream_id): - return [ - c for c in self.catalog.get("streams") if c.get("stream") == tap_stream_id - ][0] - - -# stream a is dependent on stream STREAM_DEPENDENCIES[a] -STREAM_DEPENDENCIES = {CONTACTS_BY_COMPANY: "companies"} - - -def validate_dependencies(ctx): - errs = [] - msg_tmpl = ( - "Unable to extract {0} data. " - "To receive {0} data, you also need to select {1}." 
- ) - - for k, v in STREAM_DEPENDENCIES.items(): - if k in ctx.selected_stream_ids and v not in ctx.selected_stream_ids: - errs.append(msg_tmpl.format(k, v)) - if errs: - raise DependencyException(" ".join(errs)) - - -def load_discovered_schema(stream): - schema = load_schema(stream.tap_stream_id) - mdata = metadata.new() - - mdata = metadata.write(mdata, (), "table-key-properties", stream.key_properties) - mdata = metadata.write( - mdata, (), "forced-replication-method", stream.replication_method - ) - - if stream.replication_key: - mdata = metadata.write( - mdata, (), "valid-replication-keys", [stream.replication_key] ) + return Catalog(streams) - for field_name in schema["properties"]: - if field_name in stream.key_properties or field_name == stream.replication_key: - mdata = metadata.write( - mdata, ("properties", field_name), "inclusion", "automatic" - ) - else: - mdata = metadata.write( - mdata, ("properties", field_name), "inclusion", "available" - ) - # The engagements stream has nested data that we synthesize; The engagement field needs to be automatic - if stream.tap_stream_id == "engagements": - mdata = metadata.write( - mdata, ("properties", "engagement"), "inclusion", "automatic" - ) +def sync(catalog, config, state=None): + for catalog_entry in catalog.streams: + if not catalog_entry.is_selected(): + continue + LOGGER.info(f"syncing {catalog_entry.tap_stream_id}") + stream = Stream(catalog_entry, config) + stream.do_sync(state) - return schema, metadata.to_list(mdata) +@utils.handle_top_exception(LOGGER) +def main(): -def discover_schemas(): - result = {"streams": []} - for stream in STREAMS: - LOGGER.info("Loading schema for %s", stream.tap_stream_id) - schema, mdata = load_discovered_schema(stream) - result["streams"].append( - { - "stream": stream.tap_stream_id, - "tap_stream_id": stream.tap_stream_id, - "schema": schema, - "metadata": mdata, - } - ) - - return result - - -def do_discover(): - LOGGER.info("Loading schemas") - json.dump(discover_schemas(), sys.stdout, indent=4) - - -def main_impl(): - args = utils.parse_args( - ["redirect_uri", "client_id", "client_secret", "refresh_token", "start_date"] - ) - - CONFIG.update(args.config) - STATE = {} - - if args.state: - STATE.update(args.state) + args = utils.parse_args(REQUIRED_CONFIG_KEYS) if args.discover: - do_discover() - elif args.properties: - do_sync(STATE, args.properties) + catalog = discover() + catalog.dump() else: - LOGGER.info("No properties were selected") - - -def main(): - try: - main_impl() - except Exception as exc: - LOGGER.critical(exc) - raise exc + if args.catalog: + catalog = args.catalog + else: + catalog = discover() + sync(catalog, args.config, args.state) if __name__ == "__main__": From 0d1613db6d4294f89446dd5ef8a9fedd4b13c410 Mon Sep 17 00:00:00 2001 From: JingLin0 Date: Tue, 14 Apr 2020 00:49:52 +0200 Subject: [PATCH 46/78] handle all api calls and get replication values --- tap_hubspot/hubspot.py | 173 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 173 insertions(+) create mode 100644 tap_hubspot/hubspot.py diff --git a/tap_hubspot/hubspot.py b/tap_hubspot/hubspot.py new file mode 100644 index 00000000..09c38dfb --- /dev/null +++ b/tap_hubspot/hubspot.py @@ -0,0 +1,173 @@ +import requests +from dateutil import parser +import time +from ratelimit import limits +import ratelimit +import singer +import backoff +import sys +import datetime +from singer import utils + +LOGGER = singer.get_logger() + + +class Hubspot: + SESSION = requests.Session() + BASE_URL = "https://api.hubapi.com" 
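# A short illustration (hypothetical usage) of how the entries below are used:
# each path is appended to BASE_URL when the request URL is built in
# get_url_params, e.g. for the "deals" stream:
#
#   f"{BASE_URL}{ENDPOINTS['deals']}"
#   # -> "https://api.hubapi.com/deals/v1/deal/paged"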
+ ENDPOINTS = { + "companies": "/companies/v2/companies/paged", + "contacts": "/contacts/v1/lists/all/contacts/all", + "deal_pipelines": "/crm-pipelines/v1/pipelines/deals", + "deals": "/deals/v1/deal/paged", + "email_events": "/email/public/v1/events", + "engagements": "/engagements/v1/engagements/paged", + "forms": "/forms/v2/forms", + } + DATA_PATH = { + "companies": "companies", + "contacts": "contacts", + "deal_pipelines": "results", + "deals": "deals", + "email_events": "events", + "engagements": "results", + } + REPLICATION_PATH = { + "companies": ["properties", "hs_lastmodifieddate", "timestamp",], + "contacts": ["properties", "lastmodifieddate", "value"], + "deal_pipelines": ["updatedAt"], + "deals": ["properties", "hs_lastmodifieddate", "timestamp"], + "email_events": ["created"], + "engagements": ["engagement", "lastUpdated"], + "forms": ["updatedAt"], + } + LIMIT = 250 + + def __init__(self, config, tap_stream_id, properties): + self.access_token = None + self.tap_stream_id = tap_stream_id + self.config = config + self.refresh_access_token() + self.endpoint = self.ENDPOINTS[tap_stream_id] + self.offset_value = None + self.offset_key = None + self.hasmore = True + self.PARAMS = { + "companies": {"limit": self.LIMIT, "properties": properties,}, + "contacts": { + "showListMemberships": True, + "includeVersion": True, + "count": self.LIMIT, + }, + "engagements": {"limit": self.LIMIT}, + "deals": { + "count": self.LIMIT, + "includeAssociations": False, + "properties": properties, + "limit": self.LIMIT, + }, + } + + def get_url_params(self, start_date, end_date): + url = f"{self.BASE_URL}{self.endpoint}" + params = self.PARAMS.get(self.tap_stream_id, {}) + if self.tap_stream_id == "email_events": + params = {"startTimestamp": start_date, "endTimestamp": end_date} + if self.offset_value: + params[self.offset_key] = self.offset_value + return url, params + + def get_replication_value( + self, obj: dict, path_to_replication_key=None, default=None + ): + if not path_to_replication_key: + return default + for path_element in path_to_replication_key: + obj = obj.get(path_element) + if not obj: + return default + return self.milliseconds_to_datetime(obj) + + def milliseconds_to_datetime(self, ms): + return ( + datetime.datetime.fromtimestamp((int(ms) / 1000.0), datetime.timezone.utc) + if ms + else None + ) + + def datetime_to_milliseconds(self, d: datetime.datetime): + return int(d.timestamp() * 1000) if d else None + + def get_records(self, start_date, end_date): + while self.hasmore: + url, params = self.get_url_params(start_date, end_date) + records = self.call_api(url, params=params) + if records: + replication_value = map( + lambda record: self.get_replication_value( + obj=record, + path_to_replication_key=self.REPLICATION_PATH.get( + self.tap_stream_id + ), + ), + records, + ) + yield from zip(records, replication_value) + else: + break + + def streams(self, start_date, end_date): + start_date = self.datetime_to_milliseconds(start_date) + end_date = self.datetime_to_milliseconds(end_date) + yield from self.get_records(start_date, end_date) + + @backoff.on_exception( + backoff.expo, + ( + requests.exceptions.RequestException, + requests.exceptions.HTTPError, + ratelimit.exception.RateLimitException, + ), + ) + @limits(calls=100, period=10) + def call_api(self, url, params={}): + response = self.SESSION.get( + url, headers={"Authorization": f"Bearer {self.access_token}"}, params=params + ) + LOGGER.info(response.url) + response.raise_for_status() + data = 
self.get_offset(response.json()) + + return data + + def get_offset(self, data): + data_path = self.DATA_PATH.get(self.tap_stream_id) + if isinstance(data, list): + self.hasmore = False + return data + + if self.tap_stream_id == "deal_pipelines": + self.hasmore = False + + offset = [k for k in data.keys() if k.endswith("offset")] + if offset: + offset = offset[0] + self.offset_value = data.get(offset) + self.offset_key = "vidOffset" if offset == "vid-offset" else "offset" + data = data[data_path] if data_path else data + + return data + + def refresh_access_token(self): + payload = { + "grant_type": "refresh_token", + "refresh_token": self.config["refresh_token"], + "client_id": self.config["client_id"], + "client_secret": self.config["client_secret"], + } + + resp = requests.post(self.BASE_URL + "/oauth/v1/token", data=payload) + resp.raise_for_status() + if not resp: + raise Exception(resp.text) + self.access_token = resp.json()["access_token"] From d53acf23bbb14ad2b1b9066139a803967c32e0b4 Mon Sep 17 00:00:00 2001 From: JingLin0 Date: Tue, 14 Apr 2020 00:50:20 +0200 Subject: [PATCH 47/78] write singer record, schema and state --- tap_hubspot/stream.py | 118 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100644 tap_hubspot/stream.py diff --git a/tap_hubspot/stream.py b/tap_hubspot/stream.py new file mode 100644 index 00000000..0cb8bb8e --- /dev/null +++ b/tap_hubspot/stream.py @@ -0,0 +1,118 @@ +import singer +from singer import ( + metadata, + CatalogEntry, + Transformer, + UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING, + _transform_datetime, +) +from typing import Union +from datetime import timedelta, datetime +from dateutil import parser +from tap_hubspot.hubspot import Hubspot +import pytz + +LOGGER = singer.get_logger() + + +class Stream: + def __init__(self, catalog: CatalogEntry, config): + self.tap_stream_id = catalog.tap_stream_id + self.schema = catalog.schema.to_dict() + self.key_properties = catalog.key_properties + self.mdata = metadata.to_map(catalog.metadata) + self.bookmark_key = self.mdata.get(()).get("valid-replication-keys")[0] + self.config = config + self.hubspot = Hubspot(config, self.tap_stream_id, self.get_properties()) + + def get_properties(self): + properties = [] + if self.mdata.get(("properties", "properties"), {}).get("selected"): + additional_properties = ( + self.schema.get("properties").get("properties").get("properties") + ) + properties = [key for key in additional_properties.keys()] + return properties + + def do_sync(self, state): + singer.write_schema( + self.tap_stream_id, self.schema, self.key_properties, + ) + prev_bookmark = None + start_date, end_date = self.__get_start_end(state) + with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as transformer: + try: + data = self.hubspot.streams(start_date, end_date) + for d, replication_value in data: + if not replication_value: + record = transformer.transform(d, self.schema, self.mdata) + singer.write_record(self.tap_stream_id, record) + + elif (start_date >= replication_value) or ( + end_date <= replication_value + ): + continue + + else: + record = transformer.transform(d, self.schema, self.mdata) + singer.write_record(self.tap_stream_id, record) + new_bookmark = replication_value + if not prev_bookmark: + prev_bookmark = new_bookmark + + if prev_bookmark < new_bookmark: + state = self.__advance_bookmark(state, prev_bookmark) + prev_bookmark = new_bookmark + return self.__advance_bookmark(state, prev_bookmark) + + except Exception: + 
self.__advance_bookmark(state, prev_bookmark) + raise + + def __get_start_end(self, state: dict): + end_date = pytz.utc.localize(datetime.utcnow()) + LOGGER.info(f"sync data until: {end_date}") + + config_start_date = self.config.get("start_date") + if config_start_date: + config_start_date = parser.isoparse(config_start_date) + else: + config_start_date = datetime.utcnow() + timedelta(weeks=4) + + if not state: + LOGGER.info(f"using 'start_date' from config: {config_start_date}") + return config_start_date, end_date + + account_record = state["bookmarks"].get(self.tap_stream_id, None) + if not account_record: + LOGGER.info(f"using 'start_date' from config: {config_start_date}") + return config_start_date, end_date + + current_bookmark = account_record.get(self.bookmark_key, None) + if not current_bookmark: + LOGGER.info(f"using 'start_date' from config: {config_start_date}") + return config_start_date, end_date + + start_date = parser.isoparse(current_bookmark) + LOGGER.info(f"using 'start_date' from previous state: {start_date}") + return start_date, end_date + + def __advance_bookmark(self, state: dict, bookmark: Union[str, datetime, None]): + if not bookmark: + singer.write_state(state) + return state + + if isinstance(bookmark, datetime): + bookmark_datetime = bookmark + elif isinstance(bookmark, str): + bookmark_datetime = parser.isoparse(bookmark) + else: + raise ValueError( + f"bookmark is of type {type(bookmark)} but must be either string or datetime" + ) + + state = singer.write_bookmark( + state, self.tap_stream_id, self.bookmark_key, bookmark_datetime.isoformat() + ) + singer.write_state(state) + return state From 7d4d2c38626dc96d2ac577cf8b403cbd8383cb96 Mon Sep 17 00:00:00 2001 From: JingLin0 Date: Tue, 14 Apr 2020 00:54:05 +0200 Subject: [PATCH 48/78] delete unused package --- setup.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/setup.py b/setup.py index 851f6708..d6f80ef6 100644 --- a/setup.py +++ b/setup.py @@ -11,12 +11,9 @@ classifiers=["Programming Language :: Python :: 3 :: Only"], py_modules=["tap_hubspot"], install_requires=[ - "attrs>=16.3.0, <19", "singer-python>=5.1.1, <5.9", "requests==2.22.0", "backoff>=1.3.2, <2", - "requests_mock==1.3.0", - "nose", ], entry_points=""" [console_scripts] From 030f947a041236fccf2f09a81e0394b9f75648bc Mon Sep 17 00:00:00 2001 From: JingLin0 Date: Tue, 14 Apr 2020 00:56:17 +0200 Subject: [PATCH 49/78] add ratelimit package --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index d6f80ef6..12c5159d 100644 --- a/setup.py +++ b/setup.py @@ -14,6 +14,7 @@ "singer-python>=5.1.1, <5.9", "requests==2.22.0", "backoff>=1.3.2, <2", + "ratelimit==2.2.1", ], entry_points=""" [console_scripts] From ea481842f8da96a28d531fa343c511dabddb72ff Mon Sep 17 00:00:00 2001 From: JingLin0 Date: Tue, 14 Apr 2020 09:22:53 +0200 Subject: [PATCH 50/78] simplify package data --- setup.py | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/setup.py b/setup.py index 12c5159d..eba2df32 100644 --- a/setup.py +++ b/setup.py @@ -21,20 +21,6 @@ tap-hubspot=tap_hubspot:main """, packages=["tap_hubspot"], - package_data={ - "tap_hubspot/schemas": [ - "campaigns.json", - "companies.json", - "contact_lists.json", - "contacts.json", - "deals.json", - "email_events.json", - "forms.json", - "keywords.json", - "owners.json", - "subscription_changes.json", - "workflows.json", - ], - }, + package_data={"tap_hubspot/schemas": ["*.json"]}, include_package_data=True, ) From 
9a4a7c5983b014d5e8225d05681e3f5d6f5c0912 Mon Sep 17 00:00:00 2001 From: JingLin0 Date: Tue, 14 Apr 2020 16:20:28 +0200 Subject: [PATCH 51/78] cr: make path adapt to all os by using pathlib --- tap_hubspot/__init__.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/tap_hubspot/__init__.py b/tap_hubspot/__init__.py index ffb4a6ba..5f4b2e71 100644 --- a/tap_hubspot/__init__.py +++ b/tap_hubspot/__init__.py @@ -5,6 +5,7 @@ import singer from singer import utils, metadata, Catalog, CatalogEntry, Schema from tap_hubspot.stream import Stream +from pathlib import Path KEY_PROPERTIES = "id" STREAMS = { @@ -44,18 +45,12 @@ LOGGER = singer.get_logger() -def get_abs_path(path): - return os.path.join(os.path.dirname(os.path.realpath(__file__)), path) - - def load_schemas(): schemas = {} - - for filename in os.listdir(get_abs_path("schemas")): - path = get_abs_path("schemas") + "/" + filename - file_raw = filename.replace(".json", "") - with open(path) as file: - schemas[file_raw] = json.load(file) + schemas_path = Path(__file__).parent.absolute() / "schemas" + for schema_path in schemas_path.iterdir(): + stream_name = schema_path.stem + schemas[stream_name] = json.loads(schema_path.read_text()) return schemas From 5c65a622e48526a1628b348a058263ad6425b03d Mon Sep 17 00:00:00 2001 From: JingLin0 Date: Tue, 14 Apr 2020 16:36:26 +0200 Subject: [PATCH 52/78] cr: move session in init function --- tap_hubspot/hubspot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tap_hubspot/hubspot.py b/tap_hubspot/hubspot.py index 09c38dfb..85724483 100644 --- a/tap_hubspot/hubspot.py +++ b/tap_hubspot/hubspot.py @@ -13,7 +13,6 @@ class Hubspot: - SESSION = requests.Session() BASE_URL = "https://api.hubapi.com" ENDPOINTS = { "companies": "/companies/v2/companies/paged", @@ -44,6 +43,7 @@ class Hubspot: LIMIT = 250 def __init__(self, config, tap_stream_id, properties): + self.SESSION = requests.Session() self.access_token = None self.tap_stream_id = tap_stream_id self.config = config From e75a9a0671797cb3f7ef9e02199e7698aa3d33df Mon Sep 17 00:00:00 2001 From: JingLin0 Date: Tue, 14 Apr 2020 16:40:33 +0200 Subject: [PATCH 53/78] cr: move limit to init --- tap_hubspot/hubspot.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tap_hubspot/hubspot.py b/tap_hubspot/hubspot.py index 85724483..aadf8e6c 100644 --- a/tap_hubspot/hubspot.py +++ b/tap_hubspot/hubspot.py @@ -40,10 +40,10 @@ class Hubspot: "engagements": ["engagement", "lastUpdated"], "forms": ["updatedAt"], } - LIMIT = 250 - def __init__(self, config, tap_stream_id, properties): + def __init__(self, config, tap_stream_id, properties, limit=250): self.SESSION = requests.Session() + self.limit = limit self.access_token = None self.tap_stream_id = tap_stream_id self.config = config @@ -53,18 +53,18 @@ def __init__(self, config, tap_stream_id, properties): self.offset_key = None self.hasmore = True self.PARAMS = { - "companies": {"limit": self.LIMIT, "properties": properties,}, + "companies": {"limit": self.limit, "properties": properties,}, "contacts": { "showListMemberships": True, "includeVersion": True, - "count": self.LIMIT, + "count": self.limit, }, - "engagements": {"limit": self.LIMIT}, + "engagements": {"limit": self.limit}, "deals": { - "count": self.LIMIT, + "count": self.limit, "includeAssociations": False, "properties": properties, - "limit": self.LIMIT, + "limit": self.limit, }, } From c17e39de20a59c1a54a69d062d734c5c81e9498c Mon Sep 17 00:00:00 2001 From: JingLin0 Date: 
Tue, 14 Apr 2020 16:43:27 +0200 Subject: [PATCH 54/78] cr: use int directly --- tap_hubspot/hubspot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tap_hubspot/hubspot.py b/tap_hubspot/hubspot.py index aadf8e6c..50ae0132 100644 --- a/tap_hubspot/hubspot.py +++ b/tap_hubspot/hubspot.py @@ -90,7 +90,7 @@ def get_replication_value( def milliseconds_to_datetime(self, ms): return ( - datetime.datetime.fromtimestamp((int(ms) / 1000.0), datetime.timezone.utc) + datetime.datetime.fromtimestamp((int(ms) / 1000), datetime.timezone.utc) if ms else None ) From 9a2104696296a9743726bf988d4be57957d6f70b Mon Sep 17 00:00:00 2001 From: JingLin0 Date: Tue, 14 Apr 2020 16:45:06 +0200 Subject: [PATCH 55/78] cr: typehint for ms --- tap_hubspot/hubspot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tap_hubspot/hubspot.py b/tap_hubspot/hubspot.py index 50ae0132..80472e8e 100644 --- a/tap_hubspot/hubspot.py +++ b/tap_hubspot/hubspot.py @@ -88,7 +88,7 @@ def get_replication_value( return default return self.milliseconds_to_datetime(obj) - def milliseconds_to_datetime(self, ms): + def milliseconds_to_datetime(self, ms: str): return ( datetime.datetime.fromtimestamp((int(ms) / 1000), datetime.timezone.utc) if ms From 89c678c488b0e77f475f06dd04ba40c8936b2cc8 Mon Sep 17 00:00:00 2001 From: JingLin0 Date: Tue, 14 Apr 2020 17:00:31 +0200 Subject: [PATCH 56/78] cr: rewrite logic --- tap_hubspot/stream.py | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/tap_hubspot/stream.py b/tap_hubspot/stream.py index 0cb8bb8e..8ac20871 100644 --- a/tap_hubspot/stream.py +++ b/tap_hubspot/stream.py @@ -44,25 +44,24 @@ def do_sync(self, state): try: data = self.hubspot.streams(start_date, end_date) for d, replication_value in data: - if not replication_value: - record = transformer.transform(d, self.schema, self.mdata) - singer.write_record(self.tap_stream_id, record) - - elif (start_date >= replication_value) or ( - end_date <= replication_value + if replication_value and ( + start_date >= replication_value or end_date <= replication_value ): continue + + record = transformer.transform(d, self.schema, self.mdata) + singer.write_record(self.tap_stream_id, record) + if not replication_value: + continue + + new_bookmark = replication_value + if not prev_bookmark: + prev_bookmark = new_bookmark - else: - record = transformer.transform(d, self.schema, self.mdata) - singer.write_record(self.tap_stream_id, record) - new_bookmark = replication_value - if not prev_bookmark: - prev_bookmark = new_bookmark + if prev_bookmark < new_bookmark: + state = self.__advance_bookmark(state, prev_bookmark) + prev_bookmark = new_bookmark - if prev_bookmark < new_bookmark: - state = self.__advance_bookmark(state, prev_bookmark) - prev_bookmark = new_bookmark return self.__advance_bookmark(state, prev_bookmark) except Exception: From 84a5fd54e5b9bfdeb84b5433f6309bd4f0ce3190 Mon Sep 17 00:00:00 2001 From: JingLin0 Date: Wed, 15 Apr 2020 00:41:33 +0200 Subject: [PATCH 57/78] delete unused import --- tap_hubspot/__init__.py | 2 -- tap_hubspot/hubspot.py | 4 ---- tap_hubspot/stream.py | 1 - 3 files changed, 7 deletions(-) diff --git a/tap_hubspot/__init__.py b/tap_hubspot/__init__.py index 5f4b2e71..51037283 100644 --- a/tap_hubspot/__init__.py +++ b/tap_hubspot/__init__.py @@ -1,6 +1,4 @@ #!/usr/bin/env python3 -import os -import sys import json import singer from singer import utils, metadata, Catalog, CatalogEntry, Schema diff --git a/tap_hubspot/hubspot.py 
b/tap_hubspot/hubspot.py index 80472e8e..3068f261 100644 --- a/tap_hubspot/hubspot.py +++ b/tap_hubspot/hubspot.py @@ -1,13 +1,9 @@ import requests -from dateutil import parser -import time from ratelimit import limits import ratelimit import singer import backoff -import sys import datetime -from singer import utils LOGGER = singer.get_logger() diff --git a/tap_hubspot/stream.py b/tap_hubspot/stream.py index 8ac20871..154ee81d 100644 --- a/tap_hubspot/stream.py +++ b/tap_hubspot/stream.py @@ -4,7 +4,6 @@ CatalogEntry, Transformer, UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING, - _transform_datetime, ) from typing import Union from datetime import timedelta, datetime From 0d1c816ffac05689b4841e49d5ab7888046017eb Mon Sep 17 00:00:00 2001 From: JingLin0 Date: Wed, 15 Apr 2020 00:44:02 +0200 Subject: [PATCH 58/78] cr: add a function for each endpoint and use pagination --- tap_hubspot/hubspot.py | 250 +++++++++++++++++++++++++---------------- tap_hubspot/stream.py | 6 +- 2 files changed, 157 insertions(+), 99 deletions(-) diff --git a/tap_hubspot/hubspot.py b/tap_hubspot/hubspot.py index 3068f261..cfebe6b3 100644 --- a/tap_hubspot/hubspot.py +++ b/tap_hubspot/hubspot.py @@ -4,74 +4,143 @@ import singer import backoff import datetime +from typing import Dict LOGGER = singer.get_logger() class Hubspot: BASE_URL = "https://api.hubapi.com" - ENDPOINTS = { - "companies": "/companies/v2/companies/paged", - "contacts": "/contacts/v1/lists/all/contacts/all", - "deal_pipelines": "/crm-pipelines/v1/pipelines/deals", - "deals": "/deals/v1/deal/paged", - "email_events": "/email/public/v1/events", - "engagements": "/engagements/v1/engagements/paged", - "forms": "/forms/v2/forms", - } - DATA_PATH = { - "companies": "companies", - "contacts": "contacts", - "deal_pipelines": "results", - "deals": "deals", - "email_events": "events", - "engagements": "results", - } - REPLICATION_PATH = { - "companies": ["properties", "hs_lastmodifieddate", "timestamp",], - "contacts": ["properties", "lastmodifieddate", "value"], - "deal_pipelines": ["updatedAt"], - "deals": ["properties", "hs_lastmodifieddate", "timestamp"], - "email_events": ["created"], - "engagements": ["engagement", "lastUpdated"], - "forms": ["updatedAt"], - } - - def __init__(self, config, tap_stream_id, properties, limit=250): + + def __init__(self, config, limit=250): self.SESSION = requests.Session() self.limit = limit self.access_token = None - self.tap_stream_id = tap_stream_id self.config = config self.refresh_access_token() - self.endpoint = self.ENDPOINTS[tap_stream_id] - self.offset_value = None - self.offset_key = None - self.hasmore = True - self.PARAMS = { - "companies": {"limit": self.limit, "properties": properties,}, - "contacts": { - "showListMemberships": True, - "includeVersion": True, - "count": self.limit, - }, - "engagements": {"limit": self.limit}, - "deals": { - "count": self.limit, - "includeAssociations": False, - "properties": properties, - "limit": self.limit, - }, + + def streams(self, tap_stream_id, start_date, end_date, properties): + if tap_stream_id == "companies": + yield from self.get_companies(properties) + elif tap_stream_id == "contacts": + yield from self.get_contacts() + elif tap_stream_id == "engagements": + yield from self.get_engagements() + elif tap_stream_id == "deal_pipelines": + yield from self.get_deal_pipelines() + elif tap_stream_id == "deals": + yield from self.get_deals(properties) + elif tap_stream_id == "email_events": + start_date = self.datetime_to_milliseconds(start_date) + end_date = 
self.datetime_to_milliseconds(end_date) + yield from self.get_email_events(start_date, end_date) + elif tap_stream_id == "forms": + yield from self.get_forms() + else: + return [] + + def get_companies(self, properties): + path = "/contacts/v1/lists/all/contacts/all" + data_field = "companies" + replication_path = ["properties", "hs_lastmodifieddate", "timestamp"] + params = { + "limit": self.limit, + "properties": properties, + } + offset_key = "offset" + yield from self.get_records( + path, + replication_path, + params=params, + data_field=data_field, + offset_key=offset_key, + ) + + def get_contacts(self): + path = "/contacts/v1/lists/all/contacts/all" + data_field = "contacts" + replication_path = ["properties", "lastmodifieddate", "value"] + params = { + "showListMemberships": True, + "includeVersion": True, + "count": self.limit, } + offset_key = "vid-offset" + yield from self.get_records( + path, + replication_path, + params=params, + data_field=data_field, + offset_key=offset_key, + ) + + def get_engagements(self): + path = "/engagements/v1/engagements/paged" + data_field = "results" + replication_path = ["engagement", "lastUpdated"] + params = {"limit": self.limit} + offset_key = "offset" + yield from self.get_records( + path, + replication_path, + params=params, + data_field=data_field, + offset_key=offset_key, + ) + + def get_deal_pipelines(self): + path = "/crm-pipelines/v1/pipelines/deals" + data_field = "results" + replication_path = ["updatedAt"] + yield from self.get_records(path, replication_path, data_field=data_field) + + def get_deals(self, properties): + path = "/deals/v1/deal/paged" + data_field = "deals" + replication_path = ["properties", "hs_lastmodifieddate", "timestamp"] + params = { + "count": self.limit, + "includeAssociations": False, + "properties": properties, + "limit": self.limit, + } + offset_key = "offset" + yield from self.get_records( + path, + replication_path, + params=params, + data_field=data_field, + offset_key=offset_key, + ) - def get_url_params(self, start_date, end_date): - url = f"{self.BASE_URL}{self.endpoint}" - params = self.PARAMS.get(self.tap_stream_id, {}) - if self.tap_stream_id == "email_events": - params = {"startTimestamp": start_date, "endTimestamp": end_date} - if self.offset_value: - params[self.offset_key] = self.offset_value - return url, params + def get_email_events(self, start_date, end_date): + path = "/email/public/v1/events" + data_field = "events" + replication_path = ["created"] + params = {"startTimestamp": start_date, "endTimestamp": end_date} + offset_key = "offset" + + yield from self.get_records( + path, + replication_path, + params=params, + data_field=data_field, + offset_key=offset_key, + ) + + def get_forms(self): + path = "/forms/v2/forms" + replication_path = ["updatedAt"] + yield from self.get_records(path, replication_path) + + def get_records( + self, path, replication_path, params={}, data_field=None, offset_key=None + ): + for record in self.paginate( + path, params=params, data_field=data_field, offset_key=offset_key, + ): + replication_value = self.get_replication_value(record, replication_path) + yield record, replication_value def get_replication_value( self, obj: dict, path_to_replication_key=None, default=None @@ -94,29 +163,34 @@ def milliseconds_to_datetime(self, ms: str): def datetime_to_milliseconds(self, d: datetime.datetime): return int(d.timestamp() * 1000) if d else None - def get_records(self, start_date, end_date): - while self.hasmore: - url, params = self.get_url_params(start_date, 
end_date) - records = self.call_api(url, params=params) - if records: - replication_value = map( - lambda record: self.get_replication_value( - obj=record, - path_to_replication_key=self.REPLICATION_PATH.get( - self.tap_stream_id - ), - ), - records, - ) - yield from zip(records, replication_value) + def paginate( + self, path: str, params: Dict = None, data_field: str = None, offset_key=None + ): + offset_value = None + while True: + if offset_value: + if offset_key == "vid-offset": + params["vidOffset"] = offset_value + else: + params[offset_key] = offset_value + + data = self.call_api(path, params=params) + + if not data_field: + # non paginated list + yield from data + return else: + d = data.get(data_field, []) + yield from d + if not d: + return + + if offset_key: + offset_value = data.get(offset_key) + if not offset_value: break - def streams(self, start_date, end_date): - start_date = self.datetime_to_milliseconds(start_date) - end_date = self.datetime_to_milliseconds(end_date) - yield from self.get_records(start_date, end_date) - @backoff.on_exception( backoff.expo, ( @@ -128,31 +202,13 @@ def streams(self, start_date, end_date): @limits(calls=100, period=10) def call_api(self, url, params={}): response = self.SESSION.get( - url, headers={"Authorization": f"Bearer {self.access_token}"}, params=params + f"{self.BASE_URL}{url}", + headers={"Authorization": f"Bearer {self.access_token}"}, + params=params, ) LOGGER.info(response.url) response.raise_for_status() - data = self.get_offset(response.json()) - - return data - - def get_offset(self, data): - data_path = self.DATA_PATH.get(self.tap_stream_id) - if isinstance(data, list): - self.hasmore = False - return data - - if self.tap_stream_id == "deal_pipelines": - self.hasmore = False - - offset = [k for k in data.keys() if k.endswith("offset")] - if offset: - offset = offset[0] - self.offset_value = data.get(offset) - self.offset_key = "vidOffset" if offset == "vid-offset" else "offset" - data = data[data_path] if data_path else data - - return data + return response.json() def refresh_access_token(self): payload = { diff --git a/tap_hubspot/stream.py b/tap_hubspot/stream.py index 154ee81d..3935c2d9 100644 --- a/tap_hubspot/stream.py +++ b/tap_hubspot/stream.py @@ -22,7 +22,7 @@ def __init__(self, catalog: CatalogEntry, config): self.mdata = metadata.to_map(catalog.metadata) self.bookmark_key = self.mdata.get(()).get("valid-replication-keys")[0] self.config = config - self.hubspot = Hubspot(config, self.tap_stream_id, self.get_properties()) + self.hubspot = Hubspot(config) def get_properties(self): properties = [] @@ -41,7 +41,9 @@ def do_sync(self, state): start_date, end_date = self.__get_start_end(state) with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as transformer: try: - data = self.hubspot.streams(start_date, end_date) + data = self.hubspot.streams( + self.tap_stream_id, start_date, end_date, self.get_properties() + ) for d, replication_value in data: if replication_value and ( start_date >= replication_value or end_date <= replication_value From f2809ba88a6aea452f458f210f4c8fe888fa4d38 Mon Sep 17 00:00:00 2001 From: JingLin0 Date: Wed, 15 Apr 2020 10:31:21 +0200 Subject: [PATCH 59/78] cr: raise error if the function is not implemented --- tap_hubspot/hubspot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tap_hubspot/hubspot.py b/tap_hubspot/hubspot.py index cfebe6b3..4e94944a 100644 --- a/tap_hubspot/hubspot.py +++ b/tap_hubspot/hubspot.py @@ -37,7 +37,7 @@ def streams(self, tap_stream_id, 
start_date, end_date, properties): elif tap_stream_id == "forms": yield from self.get_forms() else: - return [] + raise NotImplementedError(f"unknown stream_id: {tap_stream_id}") def get_companies(self, properties): path = "/contacts/v1/lists/all/contacts/all" From 7779e8c8af7cb26d23a56f483046746bb36f9933 Mon Sep 17 00:00:00 2001 From: JingLin0 Date: Wed, 15 Apr 2020 10:52:31 +0200 Subject: [PATCH 60/78] update replication key --- tap_hubspot/__init__.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/tap_hubspot/__init__.py b/tap_hubspot/__init__.py index 51037283..273ebf5e 100644 --- a/tap_hubspot/__init__.py +++ b/tap_hubspot/__init__.py @@ -7,13 +7,10 @@ KEY_PROPERTIES = "id" STREAMS = { - "email_events": { - "valid_replication_keys": ["startTimestamp"], - "key_properties": "id", - }, + "email_events": {"valid_replication_keys": ["created"], "key_properties": "id",}, "forms": {"valid_replication_keys": ["updatedAt"], "key_properties": "guid",}, "contacts": { - "valid_replication_keys": ["versionTimestamp"], + "valid_replication_keys": ["lastmodifieddate"], "key_properties": "vid", }, "companies": { From 0a8e9c94d42c54ecd5337b81b23e5ea087457606 Mon Sep 17 00:00:00 2001 From: JingLin0 Date: Wed, 15 Apr 2020 12:04:13 +0200 Subject: [PATCH 61/78] fix up companies endpoint --- tap_hubspot/hubspot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tap_hubspot/hubspot.py b/tap_hubspot/hubspot.py index 4e94944a..813f7150 100644 --- a/tap_hubspot/hubspot.py +++ b/tap_hubspot/hubspot.py @@ -40,7 +40,7 @@ def streams(self, tap_stream_id, start_date, end_date, properties): raise NotImplementedError(f"unknown stream_id: {tap_stream_id}") def get_companies(self, properties): - path = "/contacts/v1/lists/all/contacts/all" + path = "/companies/v2/companies/paged" data_field = "companies" replication_path = ["properties", "hs_lastmodifieddate", "timestamp"] params = { From 4c33da442806e0a4cdfc2182946d450fc7750b0b Mon Sep 17 00:00:00 2001 From: JingLin0 Date: Thu, 16 Apr 2020 09:25:50 +0200 Subject: [PATCH 62/78] include associations --- tap_hubspot/hubspot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tap_hubspot/hubspot.py b/tap_hubspot/hubspot.py index 813f7150..48d18450 100644 --- a/tap_hubspot/hubspot.py +++ b/tap_hubspot/hubspot.py @@ -100,7 +100,7 @@ def get_deals(self, properties): replication_path = ["properties", "hs_lastmodifieddate", "timestamp"] params = { "count": self.limit, - "includeAssociations": False, + "includeAssociations": True, "properties": properties, "limit": self.limit, } From ff7f71f437c2e26a0490623ace900294d6fd199a Mon Sep 17 00:00:00 2001 From: JingLin0 Date: Thu, 23 Apr 2020 12:15:20 +0200 Subject: [PATCH 63/78] fix wrong key properties in schema --- tap_hubspot/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tap_hubspot/__init__.py b/tap_hubspot/__init__.py index 273ebf5e..3e721a65 100644 --- a/tap_hubspot/__init__.py +++ b/tap_hubspot/__init__.py @@ -5,7 +5,6 @@ from tap_hubspot.stream import Stream from pathlib import Path -KEY_PROPERTIES = "id" STREAMS = { "email_events": {"valid_replication_keys": ["created"], "key_properties": "id",}, "forms": {"valid_replication_keys": ["updatedAt"], "key_properties": "guid",}, @@ -55,17 +54,18 @@ def discover() -> Catalog: streams = [] for tap_stream_id, props in STREAMS.items(): + key_properties = props.get("key_properties", None) schema = schemas[tap_stream_id] mdata = metadata.get_standard_metadata( schema=schema, - 
key_properties=props.get("key_properties", None), + key_properties=key_properties, valid_replication_keys=props.get("valid_replication_keys", []), ) streams.append( CatalogEntry( stream=tap_stream_id, tap_stream_id=tap_stream_id, - key_properties=KEY_PROPERTIES, + key_properties=key_properties, schema=Schema.from_dict(schema), metadata=mdata, ) From 291b42ad9ba31955ffbbed3a50d4f3c910bc2ad6 Mon Sep 17 00:00:00 2001 From: JingLin0 Date: Thu, 23 Apr 2020 12:15:44 +0200 Subject: [PATCH 64/78] add submissions stream --- tap_hubspot/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tap_hubspot/__init__.py b/tap_hubspot/__init__.py index 3e721a65..e8db14d1 100644 --- a/tap_hubspot/__init__.py +++ b/tap_hubspot/__init__.py @@ -28,6 +28,7 @@ "valid_replication_keys": ["lastUpdated"], "key_properties": "engagement_id", }, + "submissions": {"valid_replication_keys": ["submittedAt"], "key_properties": []}, } REQUIRED_CONFIG_KEYS = [ "start_date", From bf29a89318a31b428df580b9e60fe017d56a420d Mon Sep 17 00:00:00 2001 From: JingLin0 Date: Thu, 23 Apr 2020 12:18:28 +0200 Subject: [PATCH 65/78] add submissions schema --- tap_hubspot/schemas/submissions.json | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 tap_hubspot/schemas/submissions.json diff --git a/tap_hubspot/schemas/submissions.json b/tap_hubspot/schemas/submissions.json new file mode 100644 index 00000000..8324e5bb --- /dev/null +++ b/tap_hubspot/schemas/submissions.json @@ -0,0 +1,26 @@ +{ + "type": "object", + "properties": { + "submittedAt": { + "type": ["null", "string"], + "format": "date-time" + }, + "values": { + "type": ["null", "array"], + "items": { + "type": ["null", "object"], + "properties": { + "name": { + "type": ["null", "string"] + }, + "value": { + "type": ["null", "string"] + } + } + } + }, + "pageUrl": { + "type": ["null", "string"] + } + } +} From 51a19267e0819cc5f8a17543679a9c72602c6398 Mon Sep 17 00:00:00 2001 From: JingLin0 Date: Thu, 23 Apr 2020 12:21:28 +0200 Subject: [PATCH 66/78] get submissions data --- tap_hubspot/hubspot.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tap_hubspot/hubspot.py b/tap_hubspot/hubspot.py index 48d18450..152cfc7a 100644 --- a/tap_hubspot/hubspot.py +++ b/tap_hubspot/hubspot.py @@ -36,6 +36,8 @@ def streams(self, tap_stream_id, start_date, end_date, properties): yield from self.get_email_events(start_date, end_date) elif tap_stream_id == "forms": yield from self.get_forms() + elif tap_stream_id == "submissions": + yield from self.get_submissions() else: raise NotImplementedError(f"unknown stream_id: {tap_stream_id}") @@ -133,6 +135,24 @@ def get_forms(self): replication_path = ["updatedAt"] yield from self.get_records(path, replication_path) + def get_submissions(self): + # submission data is retrieved according to guid from forms + replication_path = ["submittedAt"] + data_field = "results" + offset_key = "after" + params = {"limit": 50} # maxmimum limit is 50 + forms = self.get_forms() + for form, _ in forms: + guid = form["guid"] + path = f"/form-integrations/v1/submissions/forms/{guid}" + yield from self.get_records( + path, + replication_path, + params=params, + data_field=data_field, + offset_key=offset_key, + ) + def get_records( self, path, replication_path, params={}, data_field=None, offset_key=None ): From 2dd374cbbaef806a969613ca500033e7861ed9e3 Mon Sep 17 00:00:00 2001 From: JingLin0 Date: Thu, 23 Apr 2020 12:22:02 +0200 Subject: [PATCH 67/78] submission pagination and reuse get_replication_value func --- 
tap_hubspot/hubspot.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/tap_hubspot/hubspot.py b/tap_hubspot/hubspot.py index 152cfc7a..65009bac 100644 --- a/tap_hubspot/hubspot.py +++ b/tap_hubspot/hubspot.py @@ -159,19 +159,19 @@ def get_records( for record in self.paginate( path, params=params, data_field=data_field, offset_key=offset_key, ): - replication_value = self.get_replication_value(record, replication_path) + replication_value = self.milliseconds_to_datetime( + self.get_value(record, replication_path) + ) yield record, replication_value - def get_replication_value( - self, obj: dict, path_to_replication_key=None, default=None - ): + def get_value(self, obj: dict, path_to_replication_key=None, default=None): if not path_to_replication_key: return default for path_element in path_to_replication_key: obj = obj.get(path_element) if not obj: return default - return self.milliseconds_to_datetime(obj) + return obj def milliseconds_to_datetime(self, ms: str): return ( @@ -207,7 +207,10 @@ def paginate( return if offset_key: - offset_value = data.get(offset_key) + if "paging" in data: + offset_value = self.get_value(data, ["paging", "next", "after"]) + else: + offset_value = data.get(offset_key) if not offset_value: break From 448baed836b665568a8ff88594d95494709e51cf Mon Sep 17 00:00:00 2001 From: JingLin0 Date: Sat, 25 Apr 2020 20:26:18 +0200 Subject: [PATCH 68/78] transform dash to underscore for schema and record --- tap_hubspot/__init__.py | 3 +- tap_hubspot/hubspot.py | 2 ++ tap_hubspot/util.py | 62 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 66 insertions(+), 1 deletion(-) create mode 100644 tap_hubspot/util.py diff --git a/tap_hubspot/__init__.py b/tap_hubspot/__init__.py index e8db14d1..1b8ea9ea 100644 --- a/tap_hubspot/__init__.py +++ b/tap_hubspot/__init__.py @@ -4,6 +4,7 @@ from singer import utils, metadata, Catalog, CatalogEntry, Schema from tap_hubspot.stream import Stream from pathlib import Path +from tap_hubspot.util import schema_nodash STREAMS = { "email_events": {"valid_replication_keys": ["created"], "key_properties": "id",}, @@ -45,7 +46,7 @@ def load_schemas(): schemas_path = Path(__file__).parent.absolute() / "schemas" for schema_path in schemas_path.iterdir(): stream_name = schema_path.stem - schemas[stream_name] = json.loads(schema_path.read_text()) + schemas[stream_name] = schema_nodash(json.loads(schema_path.read_text())) return schemas diff --git a/tap_hubspot/hubspot.py b/tap_hubspot/hubspot.py index 65009bac..1bb30f38 100644 --- a/tap_hubspot/hubspot.py +++ b/tap_hubspot/hubspot.py @@ -5,6 +5,7 @@ import backoff import datetime from typing import Dict +from tap_hubspot.util import record_nodash LOGGER = singer.get_logger() @@ -159,6 +160,7 @@ def get_records( for record in self.paginate( path, params=params, data_field=data_field, offset_key=offset_key, ): + record = record_nodash(record) replication_value = self.milliseconds_to_datetime( self.get_value(record, replication_path) ) diff --git a/tap_hubspot/util.py b/tap_hubspot/util.py new file mode 100644 index 00000000..2e3a2396 --- /dev/null +++ b/tap_hubspot/util.py @@ -0,0 +1,62 @@ +import copy + + +def record_nodash(obj): + transformed_obj = copy.deepcopy(obj) + + if not isinstance(obj, (dict, list)): + return obj + if isinstance(obj, dict): + for key in obj: + value = record_nodash(obj[key]) + transformed_obj.pop(key) + key = key.replace("-", "_") + transformed_obj[key] = value + if isinstance(obj, list): + for i in range(len(obj)): + value = 
record_nodash(obj[i]) + transformed_obj[i] = value + return transformed_obj + + +def schema_nodash(obj): + type_field = obj.get("type") + type = get_type(type_field) + if not type: + return obj + if not type in ["array", "object"]: + return obj + if "object" == type: + props = obj.get("properties", {}) + new_props = replace_props(props) + obj["properties"] = new_props + if "array" == type: + items = obj.get("items", {}) + obj["items"] = schema_nodash(items) + return obj + + +def get_type(type_field): + if isinstance(type_field, str): + return type_field + if isinstance(type_field, list): + types = set(type_field) + if "null" in types: + types.remove("null") + return types.pop() + return None + + +def replace_props(props): + if not props: + return props + keys = list(props.keys()) + for k in keys: + if not "-" in k: + props[k] = schema_nodash(props[k]) + else: + v = props.pop(k) + new_key = k.replace("-", "_") + new_value = schema_nodash(v) + props[new_key] = new_value + return props From 50e8e3797c0e6cf691c58e7b6d66e9c7fd0bc717 Mon Sep 17 00:00:00 2001 From: JingLin0 Date: Sat, 25 Apr 2020 20:26:44 +0200 Subject: [PATCH 69/78] only use record_nodash when necessary --- tap_hubspot/hubspot.py | 24 +++++++++++++----------- tap_hubspot/stream.py | 8 +++----- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/tap_hubspot/hubspot.py b/tap_hubspot/hubspot.py index 1bb30f38..d5fd4a14 100644 --- a/tap_hubspot/hubspot.py +++ b/tap_hubspot/hubspot.py @@ -13,34 +13,35 @@ class Hubspot: BASE_URL = "https://api.hubapi.com" - def __init__(self, config, limit=250): + def __init__(self, config, tap_stream_id, limit=250): self.SESSION = requests.Session() self.limit = limit self.access_token = None self.config = config self.refresh_access_token() + self.tap_stream_id = tap_stream_id - def streams(self, tap_stream_id, start_date, end_date, properties): - if tap_stream_id == "companies": + def streams(self, start_date, end_date, properties): + if self.tap_stream_id == "companies": yield from self.get_companies(properties) - elif tap_stream_id == "contacts": + elif self.tap_stream_id == "contacts": yield from self.get_contacts() - elif tap_stream_id == "engagements": + elif self.tap_stream_id == "engagements": yield from self.get_engagements() - elif tap_stream_id == "deal_pipelines": + elif self.tap_stream_id == "deal_pipelines": yield from self.get_deal_pipelines() - elif tap_stream_id == "deals": + elif self.tap_stream_id == "deals": yield from self.get_deals(properties) - elif tap_stream_id == "email_events": + elif self.tap_stream_id == "email_events": start_date = self.datetime_to_milliseconds(start_date) end_date = self.datetime_to_milliseconds(end_date) yield from self.get_email_events(start_date, end_date) - elif tap_stream_id == "forms": + elif self.tap_stream_id == "forms": yield from self.get_forms() - elif tap_stream_id == "submissions": + elif self.tap_stream_id == "submissions": yield from self.get_submissions() else: - raise NotImplementedError(f"unknown stream_id: {tap_stream_id}") + raise NotImplementedError(f"unknown stream_id: {self.tap_stream_id}") def get_companies(self, properties): path = "/companies/v2/companies/paged" @@ -160,6 +161,7 @@ def get_records( for record in self.paginate( path, params=params, data_field=data_field, offset_key=offset_key, ): + if self.tap_stream_id == "contacts": record = record_nodash(record) replication_value = self.milliseconds_to_datetime( self.get_value(record, replication_path) diff --git a/tap_hubspot/stream.py b/tap_hubspot/stream.py 
index 3935c2d9..176e4484 100644 --- a/tap_hubspot/stream.py +++ b/tap_hubspot/stream.py @@ -22,7 +22,7 @@ def __init__(self, catalog: CatalogEntry, config): self.mdata = metadata.to_map(catalog.metadata) self.bookmark_key = self.mdata.get(()).get("valid-replication-keys")[0] self.config = config - self.hubspot = Hubspot(config) + self.hubspot = Hubspot(config, self.tap_stream_id) def get_properties(self): properties = [] @@ -41,15 +41,13 @@ def do_sync(self, state): start_date, end_date = self.__get_start_end(state) with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as transformer: try: - data = self.hubspot.streams( - self.tap_stream_id, start_date, end_date, self.get_properties() - ) + data = self.hubspot.streams(start_date, end_date, self.get_properties()) for d, replication_value in data: if replication_value and ( start_date >= replication_value or end_date <= replication_value ): continue - + record = transformer.transform(d, self.schema, self.mdata) singer.write_record(self.tap_stream_id, record) if not replication_value: From be6a95e5d4a9fb91b210df53ee413e74f46f72a2 Mon Sep 17 00:00:00 2001 From: JingLin0 Date: Sat, 25 Apr 2020 20:26:58 +0200 Subject: [PATCH 70/78] update contacts schema --- tap_hubspot/schemas/contacts.json | 116 ++++++++++++++++++++++++++---- 1 file changed, 104 insertions(+), 12 deletions(-) diff --git a/tap_hubspot/schemas/contacts.json b/tap_hubspot/schemas/contacts.json index 54e4d3d2..0a6812c2 100644 --- a/tap_hubspot/schemas/contacts.json +++ b/tap_hubspot/schemas/contacts.json @@ -1,22 +1,41 @@ { "type": "object", "properties": { + "addedAt": { + "type": ["null", "string"], + "format": "date-time" + }, "vid": { "type": ["null", "integer"] }, + "canonical-vid": { + "type": ["null", "integer"] + }, + "portal-id": { + "type": ["null", "integer"] + }, + "is-contact": { + "type": ["null", "boolean"] + }, + "profile-token": { + "type": ["null", "string"] + }, + "profile-url": { + "type": ["null", "string"] + }, "properties": { - "type": "object", + "type": ["null", "object"], "properties": { - "email": { - "type": "object", + "firstname": { + "type": ["null", "object"], "properties": { "value": { "type": ["null", "string"] } } }, - "createdate": { - "type": "object", + "lastmodifieddate": { + "type": ["null", "object"], "properties": { "value": { "type": ["null", "string"], @@ -24,20 +43,93 @@ } } }, - "lastmodifieddate": { - "type": "object", + "company": { + "type": ["null", "object"], "properties": { "value": { - "type": ["null", "string"], - "format": "date-time" + "type": "string" } } }, - "associatedcompanyid": { - "type": "object", + "lastname": { + "type": ["null", "object"], "properties": { "value": { - "type": ["null", "number"] + "type": ["null", "string"] + } + } + } + } + }, + "form-submissions": { + "type": ["null", "array"], + + "items": { + "type": ["null", "object"], + "properties": { + "conversion-id": { + "type": ["null", "string"] + }, + "timestamp": { + "type": ["null", "string"], + "format": "date-time" + }, + "form-id": { + "type": ["null", "string"] + }, + "portal-id": { + "type": ["null", "integer"] + }, + "title": { + "type": ["null", "string"] + }, + "form-type": { + "type": ["null", "string"] + }, + "contact-associated-by": { + "type": "array", + "items": { + "type": ["null", "string"] + } + } + } + } + }, + "identity-profiles": { + "type": ["null", "array"], + "items": { + "type": ["null", "object"], + "properties": { + "vid": { + "type": ["null", "integer"] + }, + "saved-at-timestamp": { + "type": ["null", "string"], + 
"format": "date-time" + }, + "deleted-changed-timestamp": { + "type": ["null", "string"], + "format": "date-time" + }, + "identities": { + "type": ["null", "array"], + "items": { + "type": ["null", "object"], + "properties": { + "type": { + "type": ["null", "string"] + }, + "value": { + "type": ["null", "string"] + }, + "timestamp": { + "type": ["null", "string"], + "format": "date-time" + }, + "is-primary": { + "type": ["null", "boolean"] + } + } } } } From c2ab88c40dbb657a6c979fe4205c688df8338aa1 Mon Sep 17 00:00:00 2001 From: JingLin0 Date: Sun, 26 Apr 2020 21:26:40 +0200 Subject: [PATCH 71/78] get companyId and dealId according to contact vids --- tap_hubspot/hubspot.py | 18 +++++++++++++++++- tap_hubspot/stream.py | 2 ++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/tap_hubspot/hubspot.py b/tap_hubspot/hubspot.py index d5fd4a14..f90ec014 100644 --- a/tap_hubspot/hubspot.py +++ b/tap_hubspot/hubspot.py @@ -12,6 +12,7 @@ class Hubspot: BASE_URL = "https://api.hubapi.com" + CONTACT_DEFINITION_IDS = {"companyId": 1, "dealId": 4} def __init__(self, config, tap_stream_id, limit=250): self.SESSION = requests.Session() @@ -78,6 +79,21 @@ def get_contacts(self): offset_key=offset_key, ) + def get_association(self, vid, definition_id): + path = ( + f"/crm-associations/v1/associations/{vid}/HUBSPOT_DEFINED/{definition_id}" + ) + record = self.call_api(url=path)["results"] + if record: + return int(record[0]) + else: + return None + + def set_associations(self, record): + for association, definition_id in self.CONTACT_DEFINITION_IDS.items(): + record[association] = self.get_association(record["vid"], definition_id) + return record + def get_engagements(self): path = "/engagements/v1/engagements/paged" data_field = "results" @@ -156,7 +172,7 @@ def get_submissions(self): ) def get_records( - self, path, replication_path, params={}, data_field=None, offset_key=None + self, path, replication_path=None, params={}, data_field=None, offset_key=None ): for record in self.paginate( path, params=params, data_field=data_field, offset_key=offset_key, diff --git a/tap_hubspot/stream.py b/tap_hubspot/stream.py index 176e4484..51b19635 100644 --- a/tap_hubspot/stream.py +++ b/tap_hubspot/stream.py @@ -47,6 +47,8 @@ def do_sync(self, state): start_date >= replication_value or end_date <= replication_value ): continue + if self.tap_stream_id == "contacts": + d = self.hubspot.set_associations(d) record = transformer.transform(d, self.schema, self.mdata) singer.write_record(self.tap_stream_id, record) From 881441173291679ecd2cfc6daad6bdcef52bfbdd Mon Sep 17 00:00:00 2001 From: JingLin0 Date: Sun, 26 Apr 2020 21:27:15 +0200 Subject: [PATCH 72/78] add companyId and dealId in contact schema --- tap_hubspot/schemas/contacts.json | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tap_hubspot/schemas/contacts.json b/tap_hubspot/schemas/contacts.json index 0a6812c2..64a95210 100644 --- a/tap_hubspot/schemas/contacts.json +++ b/tap_hubspot/schemas/contacts.json @@ -8,6 +8,12 @@ "vid": { "type": ["null", "integer"] }, + "companyId": { + "type": ["null", "integer"] + }, + "dealId": { + "type": ["null", "integer"] + }, "canonical-vid": { "type": ["null", "integer"] }, From 9a77e47247eccb65b5c03c4fecaecb7f7138ade6 Mon Sep 17 00:00:00 2001 From: JingLin0 Date: Tue, 28 Apr 2020 00:43:23 +0200 Subject: [PATCH 73/78] do not sync dealId --- tap_hubspot/hubspot.py | 2 +- tap_hubspot/schemas/contacts.json | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/tap_hubspot/hubspot.py 
b/tap_hubspot/hubspot.py index f90ec014..914fbddc 100644 --- a/tap_hubspot/hubspot.py +++ b/tap_hubspot/hubspot.py @@ -12,7 +12,7 @@ class Hubspot: BASE_URL = "https://api.hubapi.com" - CONTACT_DEFINITION_IDS = {"companyId": 1, "dealId": 4} + CONTACT_DEFINITION_IDS = {"companyId": 1} def __init__(self, config, tap_stream_id, limit=250): self.SESSION = requests.Session() diff --git a/tap_hubspot/schemas/contacts.json b/tap_hubspot/schemas/contacts.json index 64a95210..6720599f 100644 --- a/tap_hubspot/schemas/contacts.json +++ b/tap_hubspot/schemas/contacts.json @@ -11,9 +11,6 @@ "companyId": { "type": ["null", "integer"] }, - "dealId": { - "type": ["null", "integer"] - }, "canonical-vid": { "type": ["null", "integer"] }, From 64c3cfda4a73b711beb3e0067078fa560ec36b6a Mon Sep 17 00:00:00 2001 From: JingLin0 Date: Tue, 28 Apr 2020 15:19:57 +0200 Subject: [PATCH 74/78] use v3 contacts api --- tap_hubspot/__init__.py | 5 +- tap_hubspot/hubspot.py | 36 +++---- tap_hubspot/schemas/contacts.json | 165 ++++++++---------------------- tap_hubspot/stream.py | 2 - 4 files changed, 59 insertions(+), 149 deletions(-) diff --git a/tap_hubspot/__init__.py b/tap_hubspot/__init__.py index 1b8ea9ea..cf939313 100644 --- a/tap_hubspot/__init__.py +++ b/tap_hubspot/__init__.py @@ -9,10 +9,7 @@ STREAMS = { "email_events": {"valid_replication_keys": ["created"], "key_properties": "id",}, "forms": {"valid_replication_keys": ["updatedAt"], "key_properties": "guid",}, - "contacts": { - "valid_replication_keys": ["lastmodifieddate"], - "key_properties": "vid", - }, + "contacts": {"valid_replication_keys": ["updatedAt"], "key_properties": "id",}, "companies": { "valid_replication_keys": ["hs_lastmodifieddate"], "key_properties": "companyId", diff --git a/tap_hubspot/hubspot.py b/tap_hubspot/hubspot.py index 914fbddc..2eeea30e 100644 --- a/tap_hubspot/hubspot.py +++ b/tap_hubspot/hubspot.py @@ -6,6 +6,7 @@ import datetime from typing import Dict from tap_hubspot.util import record_nodash +from dateutil import parser LOGGER = singer.get_logger() @@ -26,7 +27,7 @@ def streams(self, start_date, end_date, properties): if self.tap_stream_id == "companies": yield from self.get_companies(properties) elif self.tap_stream_id == "contacts": - yield from self.get_contacts() + yield from self.get_contacts(properties) elif self.tap_stream_id == "engagements": yield from self.get_engagements() elif self.tap_stream_id == "deal_pipelines": @@ -61,16 +62,15 @@ def get_companies(self, properties): offset_key=offset_key, ) - def get_contacts(self): - path = "/contacts/v1/lists/all/contacts/all" - data_field = "contacts" - replication_path = ["properties", "lastmodifieddate", "value"] + def get_contacts(self, properties): + path = "/crm/v3/objects/contacts" + data_field = "results" + offset_key = "after" + replication_path = ["updatedAt"] params = { - "showListMemberships": True, - "includeVersion": True, - "count": self.limit, + "limit": 100, + "properties": properties, } - offset_key = "vid-offset" yield from self.get_records( path, replication_path, @@ -177,11 +177,14 @@ def get_records( for record in self.paginate( path, params=params, data_field=data_field, offset_key=offset_key, ): - if self.tap_stream_id == "contacts": - record = record_nodash(record) - replication_value = self.milliseconds_to_datetime( - self.get_value(record, replication_path) - ) + if self.tap_stream_id in ["contacts"]: + replication_value = parser.isoparse( + self.get_value(record, replication_path) + ) + else: + replication_value = 
self.milliseconds_to_datetime( + self.get_value(record, replication_path) + ) yield record, replication_value def get_value(self, obj: dict, path_to_replication_key=None, default=None): @@ -209,10 +212,7 @@ def paginate( offset_value = None while True: if offset_value: - if offset_key == "vid-offset": - params["vidOffset"] = offset_value - else: - params[offset_key] = offset_value + params[offset_key] = offset_value data = self.call_api(path, params=params) diff --git a/tap_hubspot/schemas/contacts.json b/tap_hubspot/schemas/contacts.json index 6720599f..55fd0b58 100644 --- a/tap_hubspot/schemas/contacts.json +++ b/tap_hubspot/schemas/contacts.json @@ -1,142 +1,57 @@ { "type": "object", "properties": { - "addedAt": { - "type": ["null", "string"], - "format": "date-time" - }, - "vid": { - "type": ["null", "integer"] - }, - "companyId": { - "type": ["null", "integer"] - }, - "canonical-vid": { - "type": ["null", "integer"] - }, - "portal-id": { - "type": ["null", "integer"] - }, - "is-contact": { - "type": ["null", "boolean"] - }, - "profile-token": { - "type": ["null", "string"] - }, - "profile-url": { + "id": { "type": ["null", "string"] }, "properties": { - "type": ["null", "object"], + "type": "object", "properties": { - "firstname": { - "type": ["null", "object"], - "properties": { - "value": { - "type": ["null", "string"] - } - } + "associatedcompanyid": { + "type": ["null", "string"] + }, + "country": { + "type": ["null", "string"] + }, + "createdate": { + "type": ["null", "string"] + }, + "email": { + "type": ["null", "string"] }, - "lastmodifieddate": { - "type": ["null", "object"], - "properties": { - "value": { - "type": ["null", "string"], - "format": "date-time" - } - } + "hs_email_domain": { + "type": ["null", "string"] }, - "company": { - "type": ["null", "object"], - "properties": { - "value": { - "type": "string" - } - } + "hs_object_id": { + "type": ["null", "string"] }, - "lastname": { - "type": ["null", "object"], - "properties": { - "value": { - "type": ["null", "string"] - } - } + "ip_country_code": { + "type": ["null", "string"] + }, + "ip_state": { + "type": ["null", "string"] + }, + "jobtitle": { + "type": ["null", "string"] + }, + "num_associated_deals": { + "type": ["null", "string"] + }, + "state": { + "type": ["null", "string"] } } }, - "form-submissions": { - "type": ["null", "array"], - - "items": { - "type": ["null", "object"], - "properties": { - "conversion-id": { - "type": ["null", "string"] - }, - "timestamp": { - "type": ["null", "string"], - "format": "date-time" - }, - "form-id": { - "type": ["null", "string"] - }, - "portal-id": { - "type": ["null", "integer"] - }, - "title": { - "type": ["null", "string"] - }, - "form-type": { - "type": ["null", "string"] - }, - "contact-associated-by": { - "type": "array", - "items": { - "type": ["null", "string"] - } - } - } - } + "createdAt": { + "type": ["null", "string"], + "format": ["null", "date-time"] }, - "identity-profiles": { - "type": ["null", "array"], - "items": { - "type": ["null", "object"], - "properties": { - "vid": { - "type": ["null", "integer"] - }, - "saved-at-timestamp": { - "type": ["null", "string"], - "format": "date-time" - }, - "deleted-changed-timestamp": { - "type": ["null", "string"], - "format": "date-time" - }, - "identities": { - "type": ["null", "array"], - "items": { - "type": ["null", "object"], - "properties": { - "type": { - "type": ["null", "string"] - }, - "value": { - "type": ["null", "string"] - }, - "timestamp": { - "type": ["null", "string"], - "format": "date-time" - }, - 
"is-primary": { - "type": ["null", "boolean"] - } - } - } - } - } - } + "updatedAt": { + "type": ["null", "string"], + "format": ["null", "date-time"] + }, + "archived": { + "type": ["null", "boolean"] } } } diff --git a/tap_hubspot/stream.py b/tap_hubspot/stream.py index 51b19635..176e4484 100644 --- a/tap_hubspot/stream.py +++ b/tap_hubspot/stream.py @@ -47,8 +47,6 @@ def do_sync(self, state): start_date >= replication_value or end_date <= replication_value ): continue - if self.tap_stream_id == "contacts": - d = self.hubspot.set_associations(d) record = transformer.transform(d, self.schema, self.mdata) singer.write_record(self.tap_stream_id, record) From 9b92676a5d7dafe582b9e62149042ac15d76aad5 Mon Sep 17 00:00:00 2001 From: JingLin0 Date: Tue, 28 Apr 2020 19:45:41 +0200 Subject: [PATCH 75/78] update contacts schema --- tap_hubspot/schemas/contacts.json | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tap_hubspot/schemas/contacts.json b/tap_hubspot/schemas/contacts.json index 55fd0b58..97031c59 100644 --- a/tap_hubspot/schemas/contacts.json +++ b/tap_hubspot/schemas/contacts.json @@ -14,7 +14,8 @@ "type": ["null", "string"] }, "createdate": { - "type": ["null", "string"] + "type": ["null", "string"], + "format": "date-time" }, "email": { "type": ["null", "string"] @@ -44,11 +45,11 @@ }, "createdAt": { "type": ["null", "string"], - "format": ["null", "date-time"] + "format": "date-time" }, "updatedAt": { "type": ["null", "string"], - "format": ["null", "date-time"] + "format": "date-time" }, "archived": { "type": ["null", "boolean"] From 1de448aadb90f13164091942c2b84c0368711d24 Mon Sep 17 00:00:00 2001 From: "Patrick-Ranjit D. Madsen" Date: Tue, 5 May 2020 14:51:57 +0200 Subject: [PATCH 76/78] add ReadTimeout to handle normal timeouts --- tap_hubspot/hubspot.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tap_hubspot/hubspot.py b/tap_hubspot/hubspot.py index 2eeea30e..4ef63f74 100644 --- a/tap_hubspot/hubspot.py +++ b/tap_hubspot/hubspot.py @@ -238,6 +238,7 @@ def paginate( backoff.expo, ( requests.exceptions.RequestException, + requests.exceptions.ReadTimeout, requests.exceptions.HTTPError, ratelimit.exception.RateLimitException, ), From e34c31a3c6579386353a33851e999159ac0076ef Mon Sep 17 00:00:00 2001 From: "Patrick-Ranjit D. Madsen" Date: Tue, 5 May 2020 14:52:32 +0200 Subject: [PATCH 77/78] add max_tries=10, so we don't infinitely retry --- tap_hubspot/hubspot.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tap_hubspot/hubspot.py b/tap_hubspot/hubspot.py index 4ef63f74..2787f751 100644 --- a/tap_hubspot/hubspot.py +++ b/tap_hubspot/hubspot.py @@ -242,6 +242,7 @@ def paginate( requests.exceptions.HTTPError, ratelimit.exception.RateLimitException, ), + max_tries=10, ) @limits(calls=100, period=10) def call_api(self, url, params={}): From 7721e6673a258efb61dac586afca27da4d9e9883 Mon Sep 17 00:00:00 2001 From: "Patrick-Ranjit D. 
Madsen" Date: Tue, 5 May 2020 14:55:48 +0200 Subject: [PATCH 78/78] refresh token after the first 401 --- tap_hubspot/hubspot.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/tap_hubspot/hubspot.py b/tap_hubspot/hubspot.py index 2787f751..eaefb5e6 100644 --- a/tap_hubspot/hubspot.py +++ b/tap_hubspot/hubspot.py @@ -1,3 +1,4 @@ +import sys import requests from ratelimit import limits import ratelimit @@ -246,11 +247,20 @@ def paginate( ) @limits(calls=100, period=10) def call_api(self, url, params={}): - response = self.SESSION.get( - f"{self.BASE_URL}{url}", - headers={"Authorization": f"Bearer {self.access_token}"}, - params=params, - ) + url = f"{self.BASE_URL}{url}" + headers = {"Authorization": f"Bearer {self.access_token}"} + + try: + response = self.SESSION.get(url, headers=headers, params=params) + except requests.exceptions.HTTPError as err: + if not err.response.status_code == 401: + raise + + # attempt to refresh access token + self.refresh_access_token() + headers = {"Authorization": f"Bearer {self.access_token}"} + response = self.SESSION.get(url, headers=headers, params=params) + LOGGER.info(response.url) response.raise_for_status() return response.json()