Ch02-Kylin 之 Cube
May 21, 2021
Kylin 引入了一个非常重要的概念 —— Cube 和 Cuboid,Cube 由若干 Cuboid 组成。
1. 前置知识 #
名词 | 说明 |
---|---|
维度 | 可以简单理解为观察数据的角度,一般是一组离散的值。 |
度量 | 被聚合的统计值,也是聚合运算的结果,一般指聚合函数 (如:sum、count、average 等)。 |
比如下述 SQL 语句中,GROUP BY d_year, p_brand 中的 d_year 和 p_brand 就是维度。而 SELECT SUM(lo_revenue) AS lo_revenue 中的 SUM(lo_revenue) 就是度量。
SELECT SUM(lo_revenue) AS lo_revenue, d_year, p_brand
FROM p_lineorder
LEFT JOIN dates ON lo_orderdate = d_datekey
LEFT JOIN part ON lo_partkey = p_partkey
LEFT JOIN supplier ON lo_suppkey = s_suppkey
WHERE p_category = 'MFGR#12'
AND s_region = 'AMERICA'
GROUP BY d_year, p_brand
ORDER BY d_year, p_brand;
2. Cube #
而如果将表中的维度随意组合,那么每种组合我们称为 cuboid,而将这些所有的组合统称为 cube,因此其之间的关系如下图所示。
每一种维度组合称之为 Cuboid(上图中的每个点),所有 Cuboid 的集合是 Cube(上图中的整张图)。其中由所有维度组成的 Cuboid 称为 Base Cuboid,图中(time,item,location,supplier)即为 Base Cuboid,所有的 Cuboid 都可以基于 Base Cuboid 计算出来。Cube 在 kylin 中用 json 表示,如附件所示。
Model VS Cube
Model 是 Cube 的基础,用于描述一个数据模型
- 有了数据模型,定义 Cube 可以直接从此模型定义的表和列中进行选择
- 基于一个数据模型可以创建多个 Cube
3. 维度爆炸 #
其实很容易发现,维度如果设计的稍微不合理,就会导致出现若干 cuboid (最多 2^n 个),而根据业务场景,可能相当多的 cuboid 根本用不到,因此需要一定的剪枝手段,来避免构建如此大量的 cuboid。
因此 Kylin 在构建 Cube 的时候,用户可以选择 Mandatory Dimensions,Hierarchy Dimensions,Joint Dimensions 中的一种,进行剪枝,避免维度爆炸的问题。
Entry | 名词 | 说明 |
---|---|---|
Mandatory Dimensions | 必要维度 | 所有不含此维度的 cuboid 就可以被跳过计算 |
Hierarchy Dimensions | 层级维度 | 例如“国家” -> “省” -> “市”是一个层级;不符合此层级关系的 cuboid 可以被跳过计算 |
Joint Dimensions | 联合维度 | 有些维度往往一起出现,或者它们的基数非常接近(有 1:1 映射关系)。例如“user_id”和“email”。把多个维度定义为组合关系后,所有不符合此关系的 cuboids 会被跳过计算 |
4. 附件 #
4.1 model.json #
{
"uuid": "c5e7d814-c2f7-e312-f470-0a609b517684",
"last_modified": 1623556297179,
"version": "4.0.0.0",
"name": "model_demo",
"owner": "ADMIN",
"is_draft": false,
"description": "",
"fact_table": "DEMO.T_STUDENT",
"lookups": [
{
"table": "DEMO.T_PART",
"kind": "LOOKUP",
"alias": "T_PART",
"join": {
"type": "left",
"primary_key": [
"T_PART.P_PARTKEY"
],
"foreign_key": [
"T_STUDENT.STU_PARTKEY"
]
}
},
{
"table": "DEMO.T_DATES",
"kind": "LOOKUP",
"alias": "T_DATES",
"join": {
"type": "left",
"primary_key": [
"T_DATES.D_DATEKEY"
],
"foreign_key": [
"T_STUDENT.STU_DATEKEY"
]
}
}
],
"dimensions": [
{
"table": "T_STUDENT",
"columns": [
"STU_NAME",
"STU_CLASS",
"STU_PARTKEY",
"STU_DATEKEY"
]
},
{
"table": "T_PART",
"columns": [
"P_CATEGORY",
"P_PARTKEY"
]
},
{
"table": "T_DATES",
"columns": [
"D_DATE",
"D_DATEKEY"
]
}
],
"metrics": [
"T_STUDENT.STU_AGE",
"T_STUDENT.STU_SCORE"
],
"filter_condition": "",
"partition_desc": {
"partition_date_column": null,
"partition_time_column": null,
"partition_date_start": 0,
"partition_date_format": "yyyy-MM-dd",
"partition_time_format": "HH:mm:ss",
"partition_type": "APPEND",
"partition_condition_builder": "org.apache.kylin.metadata.model.PartitionDesc$DefaultPartitionConditionBuilder"
},
"capacity": "MEDIUM",
"projectName": "project_demo"
}
4.2 cube.json #
{
"uuid": "9466fff1-072b-de1b-c80b-e3b2ce39868c",
"last_modified": 1623556408303,
"version": "4.0.0.0",
"name": "cube_demo",
"is_draft": false,
"model_name": "model_demo",
"description": "",
"null_string": null,
"dimensions": [
{
"name": "STU_NAME",
"table": "T_STUDENT",
"column": "STU_NAME",
"derived": null
},
{
"name": "STU_CLASS",
"table": "T_STUDENT",
"column": "STU_CLASS",
"derived": null
},
{
"name": "STU_DATEKEY",
"table": "T_STUDENT",
"column": "STU_DATEKEY",
"derived": null
},
{
"name": "STU_PARTKEY",
"table": "T_STUDENT",
"column": "STU_PARTKEY",
"derived": null
},
{
"name": "P_PARTKEY",
"table": "T_PART",
"column": null,
"derived": [
"P_PARTKEY"
]
},
{
"name": "P_CATEGORY",
"table": "T_PART",
"column": null,
"derived": [
"P_CATEGORY"
]
},
{
"name": "D_DATEKEY",
"table": "T_DATES",
"column": null,
"derived": [
"D_DATEKEY"
]
},
{
"name": "D_DATE",
"table": "T_DATES",
"column": null,
"derived": [
"D_DATE"
]
}
],
"measures": [
{
"name": "_COUNT_",
"function": {
"expression": "COUNT",
"parameter": {
"type": "constant",
"value": "1"
},
"returntype": "bigint"
}
},
{
"name": "MIN_AGE",
"function": {
"expression": "MIN",
"parameter": {
"type": "column",
"value": "T_STUDENT.STU_AGE"
},
"returntype": "integer"
}
},
{
"name": "SUM_SCORE",
"function": {
"expression": "SUM",
"parameter": {
"type": "column",
"value": "T_STUDENT.STU_SCORE"
},
"returntype": "bigint"
}
}
],
"dictionaries": [],
"rowkey": {
"rowkey_columns": [
{
"column": "T_STUDENT.STU_NAME",
"encoding": "dict",
"isShardBy": false
},
{
"column": "T_STUDENT.STU_CLASS",
"encoding": "dict",
"isShardBy": false
},
{
"column": "T_STUDENT.STU_DATEKEY",
"encoding": "dict",
"isShardBy": false
},
{
"column": "T_STUDENT.STU_PARTKEY",
"encoding": "dict",
"isShardBy": false
}
]
},
"hbase_mapping": {
"column_family": [
{
"name": "F1",
"columns": [
{
"qualifier": "M",
"measure_refs": [
"_COUNT_",
"MIN_AGE",
"SUM_SCORE"
]
}
]
}
]
},
"aggregation_groups": [
{
"includes": [
"T_STUDENT.STU_NAME",
"T_STUDENT.STU_CLASS",
"T_STUDENT.STU_DATEKEY",
"T_STUDENT.STU_PARTKEY"
],
"select_rule": {
"hierarchy_dims": [],
"mandatory_dims": [],
"joint_dims": []
}
}
],
"signature": "6LXWNhNdvJC74BdBNyGRNw==",
"notify_list": [],
"status_need_notify": [
"ERROR",
"DISCARDED",
"SUCCEED"
],
"partition_date_start": 0,
"partition_date_end": 3153600000000,
"auto_merge_time_ranges": [
604800000,
2419200000
],
"volatile_range": 0,
"retention_range": 0,
"engine_type": 6,
"storage_type": 4,
"override_kylin_properties": {},
"cuboid_black_list": [],
"parent_forward": 3,
"mandatory_dimension_set_list": [],
"snapshot_table_desc_list": []
}