src/schemas/json/comet.json

{
  "$schema": "http://json-schema.org/draft-07/schema#",
  "title": "Comet Data Pipeline",
  "description": "JSOn Schema for Comet Data Pipeline",
  "definitions": {
    "PrimitiveType": {
      "type": "string",
      "oneOf": [
        {
          "const": "string",
          "description": ""
        },
        {
          "const": "long",
          "description": ""
        },
        {
          "const": "int",
          "description": ""
        },
        {
          "const": "short",
          "description": ""
        },
        {
          "const": "double",
          "description": ""
        },
        {
          "const": "boolean",
          "description": ""
        },
        {
          "const": "byte",
          "description": ""
        },
        {
          "const": "date",
          "description": ""
        },
        {
          "const": "timestamp",
          "description": ""
        },
        {
          "const": "decimal",
          "description": ""
        },
        {
          "const": "struct",
          "description": ""
        }
      ]
    },
    "IndexMapping": {
      "type": "string",
      "oneOf": [
        {
          "const": "text",
          "description": ""
        },
        {
          "const": "keyword",
          "description": ""
        },
        {
          "const": "long",
          "description": ""
        },
        {
          "const": "integer",
          "description": ""
        },
        {
          "const": "short",
          "description": ""
        },
        {
          "const": "byte",
          "description": ""
        },
        {
          "const": "double",
          "description": ""
        },
        {
          "const": "float",
          "description": ""
        },
        {
          "const": "half_float",
          "description": ""
        },
        {
          "const": "scaled_float",
          "description": ""
        },
        {
          "const": "date",
          "description": ""
        },
        {
          "const": "boolean",
          "description": ""
        },
        {
          "const": "binary",
          "description": ""
        },
        {
          "const": "integer_rang",
          "description": ""
        },
        {
          "const": "float_range",
          "description": ""
        },
        {
          "const": "long_range",
          "description": ""
        },
        {
          "const": "double_range",
          "description": ""
        },
        {
          "const": "date_range",
          "description": ""
        },
        {
          "const": "geo_point",
          "description": ""
        },
        {
          "const": "geo_shape",
          "description": ""
        },
        {
          "const": "ip",
          "description": ""
        },
        {
          "const": "completion",
          "description": ""
        },
        {
          "const": "token_count",
          "description": ""
        },
        {
          "const": "object",
          "description": ""
        },
        {
          "const": "array",
          "description": ""
        }
      ]
    },
    "Engine": {
      "description": "SPARK or BQ. Default value is SPARK.",
      "type": "string",
      "oneOf": [
        {
          "const": "BQ",
          "description": ""
        },
        {
          "const": "SPARK",
          "description": ""
        }
      ]
    },
    "WriteMode": {
      "description": "Append to or overwrite existing data",
      "type": "string",
      "oneOf": [
        {
          "const": "OVERWRITE",
          "description": ""
        },
        {
          "const": "APPEND",
          "description": ""
        },
        {
          "const": "ERROR_IF_EXISTS",
          "description": ""
        },
        {
          "const": "IGNORE",
          "description": ""
        }
      ]
    },
    "UserType": {
      "type": "string",
      "oneOf": [
        {
          "const": "SA",
          "description": ""
        },
        {
          "const": "USER",
          "description": ""
        },
        {
          "const": "GROUP",
          "description": ""
        }
      ]
    },
    "Trim": {
      "type": "string",
      "oneOf": [
        {
          "const": "LEFT",
          "description": ""
        },
        {
          "const": "RIGHT",
          "description": ""
        },
        {
          "const": "BOTH",
          "description": ""
        },
        {
          "const": "NONE",
          "description": ""
        }
      ]
    },
    "SinkType": {
      "description": "Where to sink the data",
      "type": "string",
      "oneOf": [
        {
          "const": "NONE",
          "description": "Don't sink. This is the default"
        },
        {
          "const": "JBDC",
          "description": "dataset will be sinked to a JDBC Database. See JdbcSink"
        },
        {
          "const": "BQ",
          "description": "Dataset is sinked to BigQuery. See BigQuerySink"
        },
        {
          "const": "ES",
          "description": "Dataset is indexed into Elasticsearch. See EsSink below"
        },
        {
          "const": "FS",
          "description": "Sink to Filesystem"
        },
        {
          "const": "KAFKA",
          "description": "Sink to Kafka"
        }
      ]
    },
    "Mode": {
      "description": "FILE mode by default.\\nFILE and STREAM are the two accepted values.\\nFILE is currently the only supported mode.",
      "type": "string",
      "oneOf": [
        {
          "const": "FILE",
          "description": ""
        },
        {
          "const": "STREAM",
          "description": ""
        },
        {
          "const": "FILE_AND_STREAM",
          "description": ""
        }
      ]
    },
    "Type": {
      "type": "object",
      "properties": {
        "name": {
          "type": "string"
        },
        "primitiveType": {
          "$ref": "#/definitions/PrimitiveType"
        },
        "pattern": {
          "type": "string"
        },
        "zone": {
          "type": "string",
          "description": "Useful for timestamp / dates"
        },
        "sample": {
          "type": "string"
        },
        "comment": {
          "type": "string"
        },
        "indexMapping": {
          "type": "string"
        }
      },
      "required": [
        "name",
        "pattern",
        "primitiveType"
      ]
    },
    "Partition": {
      "description": "Partition columns, no partitioning by default",
      "type": "object",
      "properties": {
        "sampling": {
          "type": "number",
          "description": "0.0 means no sampling, > 0  && < 1 means sample dataset, >=1 absolute number of partitions."
        },
        "attributes": {
          "type": "array",
          "items": {
            "type": "string",
            "description": "Attributes used to partition de dataset."
          }
        }
      },
      "required": []
    },
    "Position": {
      "type": "object",
      "properties": {
        "first": {
          "type": "number"
        },
        "last": {
          "type": "number"
        }
      },
      "required": [
        "first",
        "last"
      ]
    },
    "RowLevelSecurity": {
      "description": "Row level security policy to apply too the output data.",
      "type": "object",
      "properties": {
        "name": {
          "type": "string",
          "description": "This Row Level Security unique name"
        },
        "predicate": {
          "type": "string",
          "description": "The condition that goes to the WHERE clause and limit the visible rows."
        },
        "grants": {
          "description": "user / groups / service accounts to which this security level is applied.\nex : user:me@mycompany.com,group:group@mycompany.com,serviceAccount:mysa@google-accounts.com",
          "type": "array",
          "items": {
            "type": "string"
          }
        }
      }
    },
    "MergeOptions": {
      "type": "object",
      "properties": {
        "key": {
          "description": "list of attributes to join existing with incoming dataset. Use renamed columns here.",
          "type": "array",
          "items": {
            "type": "string"
          }
        },
        "delete": {
          "type": "string",
          "description": "Optional valid sql condition on the incoming dataset. Use renamed column here."
        },
        "timestamp": {
          "type": "string",
          "description": "Timestamp column used to identify last version, if not specified currently ingested row is considered the last"
        },
        "queryFilter": {
          "type": "string"
        }
      },
      "required": [
        "key"
      ]
    },
    "Format": {
      "description": "DSV by default. Supported file formats are :\\n- DSV : Delimiter-separated values file. Delimiter value is specified in the \"separator\" field.\\n- POSITION : FIXED format file where values are located at an exact position in each line.\\n- SIMPLE_JSON : For optimisation purpose, we differentiate JSON with top level values from JSON\\n  with deep level fields. SIMPLE_JSON are JSON files with top level fields only.\\n- JSON :  Deep JSON file. Use only when your json documents contain subdocuments, otherwise prefer to\\n  use SIMPLE_JSON since it is much faster.\\n- XML : XML files",
      "type": "string",
      "oneOf": [
        {
          "const": "DSV",
          "description": ""
        },
        {
          "const": "POSITION",
          "description": ""
        },
        {
          "const": "JSON",
          "description": ""
        },
        {
          "const": "ARRAY_JSON",
          "description": ""
        },
        {
          "const": "SIMPLE_JSON",
          "description": "Simple Json is made of a single level attributes of simple types (no array or map or sub objects)"
        },
        {
          "const": "XML",
          "description": ""
        }
      ]
    },
    "MapString": {
      "type": "object",
      "additionalProperties": {
        "type": "string"
      }
    },
    "Sink": {
      "type": "object",
      "properties": {
        "type": {
          "$ref": "#/definitions/SinkType",
          "description": "Once ingested, files may be sinked to BigQuery, Elasticsearch or any JDBC compliant Database."
        },
        "name": {
          "type": "string",
          "description": "Once ingested, files may be sinked to BigQuery, Elasticsearch or any JDBC compliant Database."
        },
        "id": {
          "type": "string",
          "description": "ES: Attribute to use as id of the document. Generated by Elasticsearch if not specified."
        },
        "timestamp": {
          "type": "string",
          "description": "BQ: The timestamp column to use for table partitioning if any. No partitioning by default\\nES:Timestamp field format as expected by Elasticsearch (\"{beginTs|yyyy.MM.dd}\" for example)."
        },
        "location": {
          "type": "string",
          "description": "BQ: Database location (EU, US, ...)"
        },
        "clustering": {
          "type": "string",
          "description": "BQ: List of ordered columns to use for table clustering"
        },
        "days": {
          "type": "number",
          "description": "BQ: Number of days before this table is set as expired and deleted. Never by default."
        },
        "requirePartitionFilter": {
          "type": "boolean",
          "description": "BQ: Should be require a partition filter on every request ? No by default."
        },
        "connection": {
          "type": "string",
          "description": "JDBC: Connection String"
        },
        "partitions": {
          "type": "number",
          "description": "JDBC: Number of Spark partitions"
        },
        "batchSize": {
          "type": "number",
          "description": "JDBC: Batch size of each JDBC bulk insert"
        }
      },
      "required": [
        "type"
      ]

    },
    "Metadata": {
      "type": "object",
      "properties": {
        "partition": {
          "$ref": "#/definitions/Partition"
        },
        "mode": {
          "$ref": "#/definitions/Mode"
        },
        "format": {
          "$ref": "#/definitions/Format"
        },
        "encoding": {
          "type": "string",
          "description": "UTF-8 if not specified."
        },
        "multiline": {
          "type": "boolean",
          "description": "are json objects on a single line or multiple line ? Single by default.  false means single. false also means faster"
        },
        "array": {
          "type": "boolean",
          "description": "Is the json stored as a single object array ? false by default. This means that by default we have on json document per line."
        },
        "withHeader": {
          "type": "boolean",
          "description": "does the dataset has a header ? true bu default"
        },
        "separator": {
          "type": "string",
          "description": "the values delimiter,  ';' by default value may be a multichar string starting from Spark3"
        },
        "quote": {
          "type": "string",
          "description": "The String quote char, '\"' by default"
        },
        "escape": {
          "type": "string",
          "description": "escaping char '\\' by default"
        },
        "write": {
          "$ref": "#/definitions/WriteMode",
          "description": "Write mode, APPEND by default"
        },
        "sink": {
          "$ref": "#/definitions/Sink"
        },
        "clustering": {
          "description": "List of attributes to use for clustering",
          "type": "array",
          "items": {
            "type": "string"
          }
        },
        "ignore": {
          "type": "string",
          "description": "Pattern to ignore or UDF to apply to ignore some lines"
        },
        "xml": {
          "$ref": "#/definitions/MapString",
          "description": "com.databricks.spark.xml options to use (eq. rowTag)"
        }
      }
    },
    "Schema": {
      "type": "object",
      "properties": {
        "name": {
          "type": "string",
          "description": "Schema name, must be unique among all the schemas belonging to the same domain.\n  *                     Will become the hive table name On Premise or BigQuery Table name on GCP."
        },
        "pattern": {
          "description": "filename pattern to which this schema must be applied.\n  *                     This instructs the framework to use this schema to parse any file with a filename that match this pattern.",
          "type": "string"
        },
        "primaryKey": {
          "description": "List of columns that make up the primary key",
          "type": "array",
          "items": {
            "type": "string"
          }
        },
        "attributes": {
          "description": "Attributes parsing rules.",
          "type": "array",
          "items": {
            "$ref": "#/definitions/Attribute"
          }
        },
        "metadata": {
          "$ref": "#/definitions/Metadata",
          "description": "Dataset metadata"
        },
        "merge": {
          "$ref": "#/definitions/MergeOptions"
        },
        "comment": {
          "type": "string",
          "description": "free text"
        },
        "presql": {
          "type": "array",
          "description": "Reserved for future use.",
          "items": {
            "type": "string"
          }
        },
        "postsql": {
          "type": "array",
          "description": "Reserved for future use.",
          "items": {
            "type": "string"
          }
        },
        "tags": {
          "description": "Set of string to attach to this Schema",
          "type": "array",
          "items": {
            "type": "string"
          }
        },
        "rls": {
          "description": " Experimental. Row level security to this to this schema.",
          "type": "array",
          "items": {
            "$ref": "#/definitions/RowLevelSecurity"
          }
        },
        "assertions": {
          "$ref": "#/definitions/MapString",
          "description": "Assertions to check after Load / Transform has succeeded"
        }
      }
    },
    "Attribute": {
      "type": "object",
      "properties": {
        "name": {
          "type": "string",
          "description": "Attribute name as defined in the source dataset and as received in the file"
        },
        "type": {
          "type": "string",
          "description": "semantic type of the attribute"
        },
        "foreignKey": {
          "type": "string",
          "description": "If this attribute is a foreign key, reference to [domain.]table[.attribute]"
        },
        "array": {
          "type": "boolean",
          "description": "Is it an array ?"
        },
        "required": {
          "type": "boolean",
          "description": "Should this attribute always be present in the source"
        },
        "privacy": {
          "type": "string",
          "description": "Should this attribute be applied a privacy transformation at ingestion time"
        },
        "comment": {
          "type": "string",
          "description": "free text for attribute description"
        },
        "rename": {
          "type": "string",
          "description": "If present, the attribute is renamed with this name"
        },
        "metricType": {
          "type": "string",
          "description": "If present, what kind of stat should be computed for this field"
        },
        "attributes": {
          "type": "array",
          "description": "List of sub-attributes (valid for JSON and XML files only)",
          "items": {
            "$ref": "#/definitions/Attribute"
          }
        },
        "position": {
          "$ref": "#/definitions/Position"
        },
        "default": {
          "type": "string",
          "description": "Default value for this attribute when it is not present."
        },
        "tags": {
          "type": "array",
          "description": "Tags associated with this attribute",
          "items": {
            "type": "string"
          }
        },
        "trim": {
          "$ref": "#/definitions/Trim"
        },
        "script": {
          "type": "string",
          "description": "Scripted field : SQL request on renamed column"
        }
      },
      "required": [
        "name",
        "type"
      ]
    },
    "AutoTaskDesc": {
      "type": "object",
      "properties": {
        "sql": {
          "type": "string",
          "description": "Main SQL request to execute (do not forget to prefix table names with the database name to avoid conflicts)"
        },
        "engine": {
          "$ref": "#/definitions/Engine",
          "description": "When not SPARK, the entire SQL query is pushed down."
        },
        "domain": {
          "type": "string",
          "description": "Output domain in output Area (Will be the Database name in Hive or Dataset in BigQuery)"
        },
        "dataset": {
          "type": "string",
          "description": "Dataset Name in output Area (Will be the Table name in Hive & BigQuery)"
        },
        "write": {
          "$ref": "#/definitions/WriteMode"
        },
        "area": {
          "type": "string",
          "description": "Target Area where domain / dataset will be stored."
        },
        "partition": {
          "type": "array",
          "items": {
            "type": "string"
          },
          "description": "List of columns used for partitioning the output."
        },
        "presql": {
          "type": "array",
          "items": {
            "type": "string"
          },
          "description": "List of SQL requests to executed before the main SQL request is run"
        },
        "postsql": {
          "type": "array",
          "items": {
            "type": "string"
          },
          "description": "List of SQL requests to executed after the main SQL request is run"
        },
        "sink": {
          "$ref": "#/definitions/Sink"
        },
        "rls": {
          "type": "array",
          "items": {
            "$ref": "#/definitions/RowLevelSecurity"
          }
        },
        "assertions": {
          "$ref": "#/definitions/MapString",
          "description": "Assertions to check after Load / Transform has succeeded"
        }
      },
      "required": [
        "domain",
        "dataset",
        "write"
      ]
    },
    "Domain": {
      "type": "object",
      "properties": {
        "name": {
          "type": "string",
          "description": "Domain name. Make sure you use a name that may be used as a folder name on the target storage.\n                   - When using HDFS or Cloud Storage,  files once ingested are stored in a sub-directory named after the domain name.\n                   - When used with BigQuery, files are ingested and sorted in tables under a dataset named after the domain name."
        },
        "directory": {
          "description": "Folder on the local filesystem where incoming files are stored.\n                     Typically, this folder will be scanned periodically to move the dataset to the cluster for ingestion.\n                     Files located in this folder are moved to the pending folder for ingestion by the \"import\" command.",
          "type": "string"
        },
        "metadata": {
          "$ref": "#/definitions/Metadata"
        },
        "schemaRefs": {
          "type": "array",
          "description": "List of files containing the schemas. Should start with an '_' and be located in the same folder.",
          "items": {
            "type": "string"
          }
        },
        "schemas": {
          "type": "array",
          "description": "List of schemas for each dataset in this domain.\nA domain usually contains multiple schemas. Each schema defining how the contents of the input file should be parsed.\nSee Schema for more details.",
          "items": {
            "$ref": "#/definitions/Schema"
          }
        },
        "comment": {
          "description": "Domain Description (free text)",
          "type": "string"
        },
        "extensions": {
          "type": "array",
          "items": {
            "type": "string"
          },
          "description": "recognized filename extensions. json, csv, dsv, psv are recognized by default.\nOnly files with these extensions will be moved to the pending folder."
        },
        "ack": {
          "description": "Ack extension used for each file. \".ack\" if not specified.\nFiles are moved to the pending folder only once a file with the same name as the source file and with this extension is present.\nTo move a file without requiring an ack file to be present, set explicitly this property to the empty string value \"\".",
          "type": "string"
        }
      },
      "anyOf": [
        {
          "required": [
            "name",
            "directory",
            "schemas"
          ]
        },
        {
          "required": [
            "name",
            "directory",
            "schemaRefs"
          ]
        }
      ]
    },
    "AutoJobDesc": {
      "type": "object",
      "properties": {
        "name": {
          "type": "string",
          "description": "Job name. Must be set to the prefix of the filename. [JOBNAME].comet.yml "
        },
        "area": {
          "type": "string",
          "description": "Area where the data is located.\\nWhen using the BigQuery engine, the area corresponds to the dataset name we will be working on in this job.\\nWhen using the Spark engine, this is folder where the data should be store. Default value is \"business\""
        },
        "format": {
          "type": "string",
          "description": "output file format when using Spark engine. Ignored for BigQuery. Default value is \"parquet\""
        },
        "coalesce": {
          "type": "boolean",
          "description": "When outputting files, should we coalesce it to a single file. Useful when CSV is the output format."
        },
        "tasks": {
          "type": "array",
          "items": {
            "$ref": "#/definitions/AutoTaskDesc",
            "description": "List of transform tasks to execute"
          }
        },
        "udf": {
          "type": "string",
          "description": "Register UDFs written in this JVM class when using Spark engine.\\nRegister UDFs stored at this location when using BigQuery engine"
        },
        "views": {
          "$ref": "#/definitions/MapString",
          "description": "Create temporary views using where the key is the view name and the map the SQL request corresponding to this view using the SQL engine supported syntax."
        },
        "engine": {
          "$ref": "#/definitions/Engine"
        }
      },
      "required": [
        "name",
        "tasks"
      ]
    }
  },
  "type": "object",
  "properties": {
    "types": {
      "type": "array",
      "items": {
        "$ref": "#/definitions/Type"
      }
    },
    "env": {
      "$ref": "#/definitions/MapString"
    },
    "transform": {
      "$ref": "#/definitions/AutoJobDesc"
    },
    "load": {
      "$ref": "#/definitions/Domain"
    },
    "assertions": {
      "$ref": "#/definitions/MapString",
      "description": "Assertions library defined as a map name(params) -> sql request that should return 0 record"
    },
    "views": {
      "$ref": "#/definitions/MapString"
    },
    "schemas": {
      "type": "array",
      "description": "List of schemas for each dataset in this domain.\nA domain usually contains multiple schemas. Each schema defining how the contents of the input file should be parsed.\nSee Schema for more details.",
      "items": {
        "$ref": "#/definitions/Schema"
      }
    }
  },
  "oneOf": [
    {
      "required": [
        "load"
      ]
    },
    {
      "required": [
        "views"
      ]
    },
    {
      "required": [
        "assertions"
      ]
    },
    {
      "required": [
        "env"
      ]
    },
    {
      "required": [
        "types"
      ]
    },
    {
      "required": [
        "transform"
      ]
    },
    {
      "required": [
        "schemas"
      ]
    }
  ]
}