阅读es aggregations文档

This commit is contained in:
asahi
2024-12-27 13:01:13 +08:00
parent b59337a36b
commit a204b15618

View File

@@ -899,4 +899,378 @@ GET /cooking_blog/_search
> ##### `must_not`
> `must_not`会淘汰满足指定条件的文档,即匹配`must_not`子句的文档不会出现在返回结果中
## 使用Query DSL分析数据
在使用kibana导入`sample ecommerce orders`的数据集后,其会创建一个名为`kibana_sample_data_ecommerce`的索引,其索引结构如下:
```
{
"kibana_sample_data_ecommerce": {
"mappings": {
"properties": {
"category": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword"
}
}
},
"currency": {
"type": "keyword"
},
"customer_birth_date": {
"type": "date"
},
"customer_first_name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"customer_full_name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"customer_gender": {
"type": "keyword"
},
"customer_id": {
"type": "keyword"
},
"customer_last_name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"customer_phone": {
"type": "keyword"
},
"day_of_week": {
"type": "keyword"
},
"day_of_week_i": {
"type": "integer"
},
"email": {
"type": "keyword"
},
"event": {
"properties": {
"dataset": {
"type": "keyword"
}
}
},
"geoip": {
"properties": {
"city_name": {
"type": "keyword"
},
"continent_name": {
"type": "keyword"
},
"country_iso_code": {
"type": "keyword"
},
"location": {
"type": "geo_point"
},
"region_name": {
"type": "keyword"
}
}
},
"manufacturer": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword"
}
}
},
"order_date": {
"type": "date"
},
"order_id": {
"type": "keyword"
},
"products": {
"properties": {
"_id": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"base_price": {
"type": "half_float"
},
"base_unit_price": {
"type": "half_float"
},
"category": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword"
}
}
},
"created_on": {
"type": "date"
},
"discount_amount": {
"type": "half_float"
},
"discount_percentage": {
"type": "half_float"
},
"manufacturer": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword"
}
}
},
"min_price": {
"type": "half_float"
},
"price": {
"type": "half_float"
},
"product_id": {
"type": "long"
},
"product_name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword"
}
},
"analyzer": "english"
},
"quantity": {
"type": "integer"
},
"sku": {
"type": "keyword"
},
"tax_amount": {
"type": "half_float"
},
"taxful_price": {
"type": "half_float"
},
"taxless_price": {
"type": "half_float"
},
"unit_discount_amount": {
"type": "half_float"
}
}
},
"sku": {
"type": "keyword"
},
"taxful_total_price": {
"type": "half_float"
},
"taxless_total_price": {
"type": "half_float"
},
"total_quantity": {
"type": "integer"
},
"total_unique_products": {
"type": "integer"
},
"type": {
"type": "keyword"
},
"user": {
"type": "keyword"
}
}
}
}
}
```
其中,`geoip`和`products`都为嵌套的object类型(其子字段定义在各自的`properties`下),而`geo_point`类型则用于表示地理坐标。
### get metrics
#### 计算订单的平均值
通过如下请求,可以计算数据集中所有订单的平均值:
```
GET kibana_sample_data_ecommerce/_search
{
"size": 0,
"aggs": {
"avg_order_value": {
"avg": {
"field": "taxful_total_price"
}
}
}
}
```
在上述请求体中,各属性分别代表如下含义:
- `size`:将`size`设置为0,可以避免在返回的结果中包含匹配的文档;`size`设置为0后,返回结果中只会包含聚合的结果
- `avg_order_value`为该项metric的name
- `avg`为聚合类型,会计算算术平均值
该请求的返回结果如下所示:
```
{
"took": 0,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 4675,
"relation": "eq"
},
"max_score": null,
"hits": []
},
"aggregations": {
"avg_order_value": {
"value": 75.05542864304813
}
}
}
```
返回结果中各属性含义如下所示:
- `hits.total.value`:代表数据集中的订单数量
- `hits.hits`为空,因为请求体中设置了`size`为0
- `aggregations`中包含聚合结果。请求体中为metric指定的名称为`avg_order_value`,故而该项metric的结果位于`aggregations.avg_order_value`
#### 在单个请求中计算订单的多个metrics
如果想要在单个请求中计算多个metrics,可以使用`stats`聚合类型:
```
GET kibana_sample_data_ecommerce/_search
{
"size":0,
"aggs":{
"order_stats":{
"stats":{
"field":"taxful_total_price"
}
}
}
}
```
其中,`stats`聚合类型会返回count, min, max, avg, sum五个metrics。
其返回结果如下所示(省略了`aggregations`以外的部分):
```
{
"aggregations": {
"order_stats": {
"count": 4675,
"min": 6.98828125,
"max": 2250,
"avg": 75.05542864304813,
"sum": 350884.12890625
}
}
}
```
#### 根据category对订单进行分组
可以通过`terms`聚合类型来对订单进行分组:
```
GET kibana_sample_data_ecommerce/_search
{
"size": 0,
"aggs": {
"sales_by_category": {
"terms": {
"field": "category.keyword",
"size": 5,
"order": { "_count": "desc" }
}
}
}
}
```
`terms`聚合类型会根据该字段的值对文档进行分组,每个不同的值对应一个bucket。
`"size": 5`和`"order": { "_count": "desc" }`设置了只会返回文档数量最多的5个category。
其返回结果如下所示:
```
{
"took": 4,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 4675,
"relation": "eq"
},
"max_score": null,
"hits": []
},
"aggregations": {
"sales_by_category": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 572,
"buckets": [
{
"key": "Men's Clothing",
"doc_count": 2024
},
{
"key": "Women's Clothing",
"doc_count": 1903
},
{
"key": "Women's Shoes",
"doc_count": 1136
},
{
"key": "Men's Shoes",
"doc_count": 944
},
{
"key": "Women's Accessories",
"doc_count": 830
}
]
}
}
}
```
> #### doc_count_error_upper_bound
> 基于es的分布式结构,`terms aggregations`在多个shards上运行时,document计数可能会有小的误差。`doc_count_error_upper_bound`代表计数的最大可能误差
- `sum_other_doc_count`:由于当前请求体中设置了`aggs.sales_by_category.terms.size`为5,故而`sum_other_doc_count`代表未包含在返回的buckets中的文档数量
-