-
Notifications
You must be signed in to change notification settings - Fork 188
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
First implementation of the behavioral_analytics track. #395
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
## Behavioral Analytics Track | ||
|
||
This track is for benchmarking behavioral analytics event ingestion. | ||
|
||
The dataset contains ~25M randomly generated events. | ||
|
||
To generate the JSON dataset run this command: | ||
|
||
```bash | ||
python3 _tools/event_generator.py > events.json | ||
``` | ||
|
||
### Example Documents | ||
|
||
```json | ||
{"event_type": "page_view", "payload": {"session": {"id": "9e148f81-746f-4a09-8818-8c2812284da6"}, "user": {"id": "d0206d74-7cdf-4a34-8925-6c672555fc8c"}, "page": {"url": "http://elastic.co/f6bb638a-c3e3-493c-b288-4cdd11be46b0?82655ac8-fb8b-4de7-a75b-f786399b0431", "title": "4287c28a-e312-4222-b094-2c997fc1e403", "referrer": "http://elastic.co/a7c6fb92-ffa3-4aba-963f-daea3c10367d?abbb253e-3757-42c9-803c-d8e5b1805365"}, "document": {"id": "491248f8-b15e-4221-933a-9af752710184", "index": "index-2d"}}} | ||
``` | ||
|
||
|
||
```json | ||
{"event_type": "search", "payload": {"session": {"id": "9e148f81-746f-4a09-8818-8c2812284da6"}, "user": {"id": "d0206d74-7cdf-4a34-8925-6c672555fc8c"}, "search": {"query": "4ee62034-275e-4ef0-9d2f-2d8fad0d92e9", "search_application": "index-1d", "page": {"current": 78, "size": 50}, "sort": {"name": "relevance", "direction": "desc"}, "results": {"total_results": 8863, "items": [{"page": {"url": "http://elastic.co/6160062a-b7d9-40f3-b44d-f803707bc327?df17e5ea-dbe4-42c2-b77d-d3ef8060eb8e"}, "document": {"id": "96570382-359c-48c7-8bd7-cb3ffbdfc3a8", "index": "index-1d"}}, {"page": {"url": "http://elastic.co/94711df7-f592-4398-a493-911ff68856d0?85982040-8d0f-468f-bacf-cdfe71af4da4"}, "document": {"id": "9e6a5d69-a3f3-4576-af54-3b473b61b2b9", "index": "index-1d"}}, {"page": {"url": "http://elastic.co/9e193944-6d69-4c71-a439-4f319b56b6f8?de211b26-11a5-4c90-bc78-56aae20a6330"}, "document": {"id": "0d168994-2795-4167-b2e8-f970c58467a0", "index": "index-2d"}}, {"page": {"url": "http://elastic.co/1967cb79-7300-462d-8788-05ab7548bd6a?3862e1e7-7925-4e45-9200-1fce4c056132"}, "document": {"id": "7fcc3fc1-876a-44c0-acb9-9ea443c824c8", "index": "index-2d"}}, {"page": {"url": "http://elastic.co/50592d77-b3f4-4f32-b6b7-31a5266e47c2?70ed0ab6-a28c-4388-b5b6-f4ed520462d3"}, "document": {"id": "86b415e5-0450-421b-a2bc-c3f42cc74898", "index": "index-1d"}}, {"page": {"url": "http://elastic.co/1d4b487a-ea95-4160-a8fd-e1782b63169b?626502c6-222e-4e4d-87ce-dbd649d70922"}, "document": {"id": "a9547e91-29b7-4f13-b234-8db6e0e161cd", "index": "index-1d"}}, {"page": {"url": "http://elastic.co/d3882040-7bbf-46ca-86d6-96dde61d1d1f?9a77af05-2f01-4848-ac36-fbcfe3ea684b"}, "document": {"id": "5603497e-f082-4eaf-9b79-90b078d7daeb", "index": "index-1d"}}, {"page": {"url": "http://elastic.co/368500fa-b4f9-4e71-bc43-d106ba7c6f50?34a624a4-63b7-47cb-9751-e75edf5abdb2"}, "document": {"id": "8ec05a68-05c0-49ea-9957-c36b46b8df96", "index": "index-2d"}}, {"page": {"url": 
"http://elastic.co/b24a1b1a-8535-41bf-ac35-8ec7f8ca11d6?aa3409f5-8a7a-42ac-a841-f5084e996efe"}, "document": {"id": "69897531-3aef-46d7-b279-409b1e5dcd5e", "index": "index-2d"}}, {"page": {"url": "http://elastic.co/7c74f14f-fbbc-425f-b0e5-8bcc18f0469c?690849b8-7723-4e3e-bc05-ad060891b041"}, "document": {"id": "353f6706-9991-4750-b8e5-4c09bf8679c8", "index": "index-1d"}}]}}}} | ||
``` | ||
|
||
### Parameters | ||
|
||
This track accepts the following parameters with Rally 0.8.0+ using `--track-params`: | ||
|
||
* `target_throughput` (default: 2000): Number of events per second submitted by the clients. | ||
* `ingest_clients` (default: 5): Number of clients that issue event post requests. | ||
* `ingest_warmup` (default: 40): Warmup time period applied to the event ingest operation. |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,128 @@ | ||
#!/usr/bin/env python3 | ||
|
||
import json | ||
import random | ||
import struct | ||
import sys | ||
import uuid | ||
|
||
# Constants used in event randomization.
num_sessions = 1000000  # number of (session id, user id) pairs to simulate
num_paths = 50000  # bound used to size the URL path pool and the page title pool
num_query_params = 1000  # bound used to size the URL query-string pool
num_docs = 50000  # bound used to size the document id pool
num_index = 3  # bound used to size the index name pool
# `num_queries` used to be assigned twice (10000, then 20000); the second
# assignment always won, so only the effective value is kept.
num_queries = 20000  # bound used to size the search query pool
num_search_apps = 2  # bound used to size the search application pool
num_result = 10  # number of result items attached to every search event
search_ratio = 30  # threshold compared (with `<`) against random.randint(1, 100)
|
||
|
||
# Helper used to generate the random parts of events.
def random_identifier():
    """Return a newly generated version-4 UUID rendered as a string."""
    identifier = uuid.uuid4()
    return str(identifier)
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What are your thoughts on making the randomness reproducible by using a random seed and submitting the randomness to the uuid4 function? (It's not because we can that we should, I'm just wondering.) |
||
|
||
|
||
# Pre-generated pools that the event builders sample from with random.choice.
# NOTE(review): every pool is built from range(1, n) and therefore holds n - 1
# entries (for example num_index = 3 yields only "index-1d" and "index-2d").
# The sizes are kept exactly as they were; confirm whether n entries were meant.
random_paths = [random_identifier() for _ in range(1, num_paths)]
random_query_params = [random_identifier() for _ in range(1, num_query_params)]
random_titles = [random_identifier() for _ in range(1, num_paths)]
random_docs = [random_identifier() for _ in range(1, num_docs)]
random_indices = ["index-%sd" % (i) for i in range(1, num_index)]
random_search_applications = ["index-%sd" % (i) for i in range(1, num_search_apps)]
random_queries = [random_identifier() for _ in range(1, num_queries)]
Comment on lines
+27
to
+33
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why use
|
||
|
||
|
||
def random_url():
    """Return an "http://elastic.co/<path>?<query>" URL from the random pools.

    The path is drawn from ``random_paths`` first, then the query string from
    ``random_query_params``.
    """
    path = random.choice(random_paths)
    query_string = random.choice(random_query_params)
    return "http://elastic.co/%s?%s" % (path, query_string)
|
||
|
||
def random_search_result_item():
    """Return one random search result entry.

    The entry holds a ``page`` (random URL) and a ``document`` whose id and
    index are drawn from ``random_docs`` and ``random_indices``.
    """
    # Draw order (url, document id, index) mirrors the key order of the result.
    page = {"url": random_url()}
    document = {
        "id": random.choice(random_docs),
        "index": random.choice(random_indices),
    }
    return {"page": page, "document": document}
|
||
|
||
def base_event_payload(session_id, user_id):
    """Return the payload fields shared by every event type.

    Args:
        session_id: Value stored under ``payload["session"]["id"]``.
        user_id: Value stored under ``payload["user"]["id"]``.

    Returns:
        A new dict with the ``session`` and ``user`` sub-documents.
    """
    session = {"id": session_id}
    user = {"id": user_id}
    return {"session": session, "user": user}
|
||
|
||
def random_page_view_event(session_id, user_id):
    """Build a random "page_view" event for the given session and user.

    Args:
        session_id: Session identifier copied into the payload.
        user_id: User identifier copied into the payload.

    Returns:
        A dict with ``event_type`` "page_view" and a payload made of the
        shared session/user fields plus random ``page`` and ``document`` data.
    """
    payload = base_event_payload(session_id, user_id)
    # Draw order (url, title, referrer, document id, index) follows key order.
    payload["page"] = {
        "url": random_url(),
        "title": random.choice(random_titles),
        "referrer": random_url(),
    }
    payload["document"] = {
        "id": random.choice(random_docs),
        "index": random.choice(random_indices),
    }
    return {"event_type": "page_view", "payload": payload}
|
||
|
||
def random_search_event(session_id, user_id):
    """Build a random "search" event for the given session and user.

    Args:
        session_id: Session identifier copied into the payload.
        user_id: User identifier copied into the payload.

    Returns:
        A dict with ``event_type`` "search" and a payload made of the shared
        session/user fields plus a random ``search`` sub-document (query,
        search application, pagination, sort and ``num_result`` result items).
    """
    payload = base_event_payload(session_id, user_id)
    # The random draws below are made in the same order as the keys are listed.
    query = random.choice(random_queries)
    application = random.choice(random_search_applications)
    pagination = {
        "current": random.randint(1, 100),
        "size": random.choice([10, 20, 50]),
    }
    ordering = {
        "name": random.choice(["relevance", "name", "price"]),
        "direction": random.choice(["asc", "desc"]),
    }
    total_results = random.randint(1, 10000)
    items = [random_search_result_item() for _ in range(num_result)]
    payload["search"] = {
        "query": query,
        "search_application": application,
        "page": pagination,
        "sort": ordering,
        "results": {
            "total_results": total_results,
            "items": items,
        },
    }
    return {"event_type": "search", "payload": payload}
|
||
|
||
def random_event(session_id, user_id):
    """Return a random event, either a search or a page view.

    A search event is produced when ``random.randint(1, 100)`` is strictly
    below ``search_ratio``; otherwise a page view event is produced.
    """
    # NOTE(review): with the strict `<`, search_ratio = 30 selects 29 of the
    # 100 possible draws. Confirm whether `<=` was intended.
    is_search = random.randint(1, 100) < search_ratio
    build_event = random_search_event if is_search else random_page_view_event
    return build_event(session_id, user_id)
|
||
|
||
# Progress reporting is optional: use tqdm when it is installed, otherwise
# fall back to a plain range and warn on stderr.
try:
    from tqdm import tqdm
except ModuleNotFoundError:
    print("Warning: [tqdm] package is not available and you won't be able to see progress.", file=sys.stderr)
    iterate = range
else:

    def iterate(count):
        """Iterate over range(count) while displaying a tqdm progress bar."""
        return tqdm(range(count))
|
||
# Upper bound handed to random.randint when drawing the per-session event
# count. Review feedback on this loop: "This 50 could also be a constant."
max_events_per_session = 50

# Emit one JSON document per line on stdout. All events of a session share
# the same session id and user id.
for _ in iterate(num_sessions):
    session_id = random_identifier()
    user_id = random_identifier()
    # range(1, k) with k drawn from [1, max_events_per_session] produces
    # k - 1 events, i.e. between 0 and 49 events per session.
    for _event_number in range(1, random.randint(1, max_events_per_session)):
        print(json.dumps(random_event(session_id, user_id), ensure_ascii=False))
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
{ | ||
"name": "event-ingest", | ||
"description": "Send the whole event corpus to the behavioral_analytics ingest API", | ||
"default": true, | ||
"schedule": [ | ||
{ | ||
"operation": "delete-analytics-collection", | ||
"clients": 1 | ||
}, | ||
{ | ||
"operation": "create-analytics-collection", | ||
"clients": 1 | ||
}, | ||
{ | ||
"operation": "check-cluster-health", | ||
"clients": 1 | ||
}, | ||
{ | ||
"operation": "event-ingest", | ||
"warmup-time-period": {{ ingest_warmup | default(40) | int }}, | ||
"clients": {{ingest_clients | default(5) | int }}, | ||
"target-throughput": {{target_throughput | default(2000) | int }} | ||
}, | ||
{ | ||
"operation": "refresh-after-index" | ||
}, | ||
|
||
{ | ||
"operation": "wait-until-merges-finish-after-index" | ||
} | ||
] | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
events.json.bz2 | ||
events-1k.json.bz2 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
{ | ||
"name": "create-analytics-collection", | ||
"operation-type": "create-data-stream" | ||
}, | ||
{ | ||
"name": "delete-analytics-collection", | ||
"operation-type": "delete-data-stream" | ||
}, | ||
{ | ||
"name": "check-cluster-health", | ||
"operation-type": "cluster-health", | ||
"request-params": { | ||
"wait_for_status": "green" | ||
}, | ||
"retry-until-success": true | ||
}, | ||
{ | ||
"name": "event-ingest", | ||
"operation-type": "raw-request", | ||
"param-source": "event-ingest-param-source" | ||
}, | ||
{ | ||
"name": "refresh-after-index", | ||
"operation-type": "refresh", | ||
"request-timeout": 1000 | ||
}, | ||
{ | ||
"name": "wait-until-merges-finish-after-index", | ||
"operation-type": "index-stats", | ||
"index": "_all", | ||
"condition": { | ||
"path": "_all.total.merges.current", | ||
"expected-value": 0 | ||
}, | ||
"retry-until-success": true, | ||
"include-in-reporting": false | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
{% import "rally.helpers" as rally with context %} | ||
|
||
{ | ||
"version": 2, | ||
"description": "Behavioral analytics related benchmarks", | ||
"data-streams": [ | ||
{ | ||
"name": "behavioral_analytics-events-test_collection" | ||
} | ||
], | ||
"corpora": [ | ||
{ | ||
"name": "behavioral_analytics", | ||
"base-url": "https://rally-tracks.elastic.co/behavioral_analytics", | ||
"documents": [ | ||
{ | ||
"source-file": "events.json.bz2", | ||
"document-count": 24514512 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you please add uncompressed-bytes too? |
||
} | ||
] | ||
} | ||
], | ||
"challenges": [ | ||
{{ rally.collect(parts="challenges/*.json") }} | ||
], | ||
"operations": [ | ||
{{ rally.collect(parts="operations/*.json") }} | ||
] | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Python constants are written in ALL_CAPS per PEP 8