大家一定对sql非常熟悉,关系型数据库自不必说,现在越来越多的大数据系统也都支持sql,比如hive、odps、presto、phoenix(hbase)、galaxy以及cep(esper)等都支持sql或者类sql语言。sql语言更接近自然语言,让人非常容易理解,上手也比较方便,可以有效降低系统的入门门槛。很多大数据系统都用antlr来实现sql:词法/语法解析、编译、抽象语法树这些比较复杂的环节,在antlr的帮助下都简单了很多。
sql可以帮助我们实现sum,avg,max,min,count等简单的聚合计算,还可以依靠parsii(https://github.com/scireum/parsii)这种表达式解析工具实现更复杂的表达式条件过滤功能。
sql看起来是对静态数据集的一种计算操作,比如select sum(field1) from tablex,是对表tablex的某一个字段进行加和操作,数据库的表相对来讲是一个静态的数据集。但其实sql还支持流数据的计算,对静态数据集和对流数据计算本质上并没有什么区别,都是单条记录,单个事件,或者tuple之类的数据单元分别计算后再聚合的结果。
不同系统的sql被antlr编译解析完成的执行计划也完全不同,hive是mr job,galaxy是storm topology等,那么假设我们现在有一批窗口数据,或者说有限数据集,如何完成这些数据按照字段分组聚合的功能?
有时候我们会在storm中完成一些聚合操作(非trident),那就需要你自己实现groupby之类的逻辑,当然我们也可以选择Esper或者siddhi这种开源cep引擎,你只需要写写sql就可以实现你的逻辑,但是一般cep 引擎比较消耗内存和cpu,而我们仅仅需要一些基础聚合功能,用它显得不划算。
那么现在我们就自己实现一个简单的分组聚合引擎:
1、首先定义一个Javabean,用来描述一种类型的事件或者叫record,包含事件的schema和一些标签数据
import java.io.Serializable;
import java.util.Map;
/**
 * Base type for the events (records) fed into the group-by aggregation engine.
 *
 * <p>An event carries a timestamp and an optional map of string tags. Subclasses add
 * their own typed fields through ordinary JavaBean getters and setters.
 */
public class EventBase implements Serializable {
    private long timestamp;
    private Map<String, String> tags;

    public EventBase() {
    }

    /** Returns the timestamp stored on this event. */
    public long getTimestamp() {
        return timestamp;
    }

    /** Sets the timestamp stored on this event. */
    public void setTimestamp(long timestamp) {
        this.timestamp = timestamp;
    }

    /** Returns the tag map, or {@code null} when no tags have been set. */
    public Map<String, String> getTags() {
        return tags;
    }

    /** Replaces the tag map; the given map is stored as-is (no copy). */
    public void setTags(Map<String, String> tags) {
        this.tags = tags;
    }

    /**
     * Renders the event as {@code prefix:,timestamp:<ts>,tags:[k1=v1,k2=v2]}.
     *
     * <p>The leading {@code "prefix:"} label is kept from the original listing even though
     * this class has no prefix field. The tag separators were unreadable in the original
     * listing and have been reconstructed. A {@code null} tag map renders as {@code []}.
     */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append("prefix:");
        sb.append(",timestamp:");
        sb.append(timestamp);
        sb.append(",tags:[");
        if (tags != null) {
            boolean first = true;
            for (Map.Entry<String, String> entry : tags.entrySet()) {
                if (!first) {
                    sb.append(",");
                }
                sb.append(entry.toString());
                first = false;
            }
        }
        sb.append("]");
        return sb.toString();
    }
}
用户可以继承该事件,实现自己的事件的定义,比如:
public class TestEvent extends EventBase {
int numHosts;
private Long numClusters;
public int getNumHosts() {
return numHosts;
}
void setNumHosts(int numHosts) {
this.numHosts = numHosts;
}
public Long getNumClusters() {
return numClusters;
}
setNumClusters(Long numClusters) {
this.numClusters = numClusters;
}
public String toString(){
StringBuffer sb = new StringBuffer();
sb.append(super.toString());
return sb.toString();
}
}
2、弄一个聚合接口,然后实现它
interface Aggregator {
public process(EventBase event) throws Exception;
}
3、定义聚合类型,目前先支持count,sum,avg,max,min这5种类型
import java.util.regex.Matcher;
import java.util.regex.Pattern;
enum AggregateType {
count("^(count)$"),sum("^sum\\((.*)\\)$"),avg("^avg\\((.*)\\)$"),max("^max\\((.*)\\)$"),min("^min\\((.*)\\)$");
private Pattern pattern;
private AggregateType(String patternString){
this.pattern = Pattern.compile(patternString);
}
public AggregateTypeMatcher matcher(String function){
Matcher m = pattern.matcher(function);
if(m.find()){
return new AggregateTypeMatcher(this,true,m.group(1));
}else{
false,255)">null);
}
}
static AggregateTypeMatcher matchAll(function){
for(AggregateType type : values()){
Matcher m = type.pattern.matcher(function);
if(m.find()){
type,255)">1));
}
}
new AggregateTypeMatcher(null,255)">null);
}
}
class AggregateTypeMatcher {
private final AggregateType type;
private final boolean matched;
String field;
public AggregateTypeMatcher(AggregateType boolean matched,116)">String field){
this.type = type;
this.matched = matched;
this.field = field;
}
public boolean find(){
this.matched;
}
String field(){
this.field;
}
public AggregateType type(){
this.type;
}
}
4、实现聚合接口
import org.apache.commons.beanutils.PropertyUtils;
import java.beans.PropertyDescriptor;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.util.ArrayList;
import java.util.List;
abstract class AbstractAggregator implements Aggregator {
static final String UNASSIGNED = "unassigned";
protected List<String> groupbyFields;
protected List<AggregateType> aggregateTypes;
String> aggregatedFields;
private Boolean[] _groupbyFieldPlacementCache;
private Method[] _aggregateFieldReflectedMethodCache;
public AbstractAggregator(List<String> groupbyFields,List<AggregateType> aggregateFuntionTypes,List<String> aggregatedFields){
this.groupbyFields = groupbyFields;
this.aggregateTypes = aggregateFuntionTypes;
this.aggregatedFields = aggregatedFields;
_aggregateFieldReflectedMethodCache = new Method[this.aggregatedFields.size()];
_groupbyFieldPlacementCache = new Boolean[this.groupbyFields.size()];
}
abstract Object result();
protected String createGroupFromTags(EventBase entity,114)">String groupbyField,116)">int i){
String groupbyFieldValue = entity.getTags().get(groupbyField);
if(groupbyFieldValue != null){
_groupbyFieldPlacementCache[i] = true;
return groupbyFieldValue;
}
null;
}
String createGroupFromQualifiers(EventBase entity,114)">try{
PropertyDescriptor pd = PropertyUtils.getPropertyDescriptor(entity,groupbyField);
if(pd == null)
null;
_groupbyFieldPlacementCache[i] = false;
return (String)(pd.getReadMethod().invoke(entity));
}catch(NoSuchMethodException ex){
null;
}catch(InvocationTargetException ex){
catch(IllegalAccessException ex){
null;
}
}
String determineGroupbyFieldValue(EventBase entity,116)">int i){
Boolean placement = _groupbyFieldPlacementCache[i];
String groupbyFieldValue = null;
if(placement != null){
groupbyFieldValue = placement.booleanValue() ? createGroupFromTags(entity,groupbyField,i) : createGroupFromQualifiers(entity,i);
}else{
groupbyFieldValue = createGroupFromTags(entity,i);
if(groupbyFieldValue == null){
groupbyFieldValue = createGroupFromQualifiers(entity,i);
}
}
groupbyFieldValue = (groupbyFieldValue == null ? UNASSIGNED : groupbyFieldValue);
return groupbyFieldValue;
}
protected List<Double> createPreAggregatedValues(EventBase entity) throws Exception{
List<Double> values = new ArrayList<Double>();
int functionIndex = 0;
for(AggregateType type : aggregateTypes){
if(type.name().equals(AggregateType.count.name())){
values.add(new Double(1));
}else{
String aggregatedField = aggregatedFields.get(functionIndex);
try {
Method m = _aggregateFieldReflectedMethodCache[functionIndex];
if (m == null) {
String tmp = aggregatedField.substring(0,1).toUpperCase() + aggregatedField.substring(1);
m = entity.getClass().getMethod("get" + tmp);
_aggregateFieldReflectedMethodCache[functionIndex] = m;
}
Object obj = m.invoke(entity);
values.add(numberToDouble(obj));
} catch (Exception ex) {
throw ex;
}
}
functionIndex++;
}
return values;
}
protected Double numberToDouble(Object obj) throws Exception {
if(obj instanceof Double)
return (Double)obj;
instanceof Integer){
new Double(((Integer)obj).doubleValue());
}
instanceof Long){
new Double(((Long)obj).doubleValue());
}
if(obj == 0.0);
}
instanceof String){
try{
new Double((String)obj);
}catch(Exception ex){
System.out.println("Datapoint ignored because it can not be converted to correct number for " + obj + ex);
0.0);
}
}
throw new Exception(obj.getClass().toString() + " type is not support. The aggregated field must be numeric type,int,long or double");
}
}
我需要提供聚合字段,聚合类型,分组字段(group by) 其中聚合字段和聚合类型 list中元素是一一对应的。为每一种聚合类型实现聚合方法和工厂类,具体聚合算法:
对于数据集中的每一个单条数据先进行预处理:
对于count类型,那么直接返回结果1;sum、avg、max、min对于单条数据来讲,直接返回对应字段的值即可。
到这里我们仅仅完成了单条数据的预处理。要实现groupby,实际上是先按照字段完成分组,分组内的数据再进行聚合
5、实现bucket
import java.util.HashMap;
import java.util.List;
import java.util.Map;
class GroupbyBucket {
static Map<String,FunctionFactory> functionFactories =
new HashMap<String,FunctionFactory>();
static{
functionFactories.put(AggregateType.count.name(),114)">new CountFactory());
functionFactories.put(AggregateType.sum.name(),114)">new SumFactory());
functionFactories.put(AggregateType.min.name(),114)">new MinFactory());
functionFactories.put(AggregateType.max.name(),114)">new MaxFactory());
functionFactories.put(AggregateType.avg.name(),114)">new AvgFactory());
}
private List<AggregateType> types;
private Map<List<String>,List<Function>> group2FunctionMap = new HashMap<List<String>,List<Function>>();
public GroupbyBucket(List<AggregateType> types){
this.types = types;
}
addDatapoint(List<String> groupbyFieldValues,List<Double> values){
List<Function> functions = group2FunctionMap.get(groupbyFieldValues);
if(functions == null){
functions = new ArrayList<Function>();
for(AggregateType type : types){
functions.add(functionFactories.get(type.name()).createFunction());
}
group2FunctionMap.put(groupbyFieldValues,functions);
}
int functionIndex = for(Double v : values){
functions.get(functionIndex).run(v);
functionIndex++;
}
}
public Map<List<String>,List<Double>> result(){
Map<List<String>,List<Double>> result = for(Map.Entry<List<String>,List<Function>> entry : this.group2FunctionMap.entrySet()){
List<Double> values = new ArrayList<Double>();
for(Function f : entry.getValue()){
values.add(f.result());
}
result.put(entry.getKey(),values);
}
return result;
}
static interface FunctionFactory{
public Function createFunction();
}
abstract Function{
int count;
abstract run(double v);
double result();
count(){
return count;
}
incrCount(){
count ++;
}
}
class CountFactory implements FunctionFactory{
@Override
new Count();
}
}
class Count Sum{
Countsuper();
}
}
class SumFactory new Sum();
}
}
class Sum double summary;
Sumthis.summary = 0.0;
}
@Override
double v){
this.incrCount();
this.summary += v;
}
@Override
this.summary;
}
}
class MinFactory new Min();
}
}
class Min double minimum;
Minthis.minimum = Double.MAX_VALUE;
}
@Override
if(v < minimum){
minimum = v;
}
this.incrCount();
}
@Override
return minimum;
}
}
class MaxFactory new Max();
}
}
class Max double maximum;
Maxthis.maximum = if(v > maximum){
maximum = v;
}
return maximum;
}
}
class AvgFactory new Avg();
}
}
class Avg double total;
Avgthis.total = double v){
total += v;
this.incrCount();
}
@Override
this.total/this.count;
}
}
}
6、分组聚合实现
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
public class AggregatorImple AbstractAggregator{
protected GroupbyBucket bucket;
public AggregatorImple(List<String> groupbyFields,116)">List<AggregateType> aggregateFuntionTypes,116)">String> aggregatedFields){
super(groupbyFields,aggregateFuntionTypes,aggregatedFields);
bucket = new GroupbyBucket(this.aggregateTypes);
}
public void process(EventBase entity) throws Exception{
String> groupbyFieldValues = createGroup(entity);
List<Double> preAggregatedValues = createPreAggregatedValues(entity);
bucket.addDatapoint(groupbyFieldValues,preAggregatedValues);
}
public Map<String>,116)">List<Double>> result(){
return bucket.result();
}
protected String> createGroup(EventBase entity){
String> groupbyFieldValues = new ArrayList<String>();
int i = for(String groupbyField : groupbyFields){
String groupbyFieldValue = determineGroupbyFieldValue(entity,i++);
groupbyFieldValues.add(groupbyFieldValue);
}
return groupbyFieldValues;
}
}
7、验证测试
import java.util.*;
/**
* Created by dongbin.db on 2015/12/22.
*/
public class Test {
private TestEvent createEntity(final String cluster,final String datacenter,116)">String rack,116)">int numHosts,long numClusters){
TestEvent entity = new TestEvent();
Map<String,String> tags = new HashMap<String>(){{
put("cluster",cluster);
put("datacenter",datacenter);
put("rack",rack);
}};
entity.setTags(tags);
entity.setNumHosts(numHosts);
entity.setNumClusters(numClusters);
return entity;
}
public void testSingleGroupbyFieldSingleFunctionForCount(){
TestEvent[] entities = new TestEvent[5];
entities[0] = createEntity("cluster1","dc1",116)">"rack123",255)">12,255)">2);
entities[1] = createEntity(20,255)">1);
entities[2] = createEntity("rack128",255)">10,255)">0);
entities[3] = createEntity("cluster2",116)">"rack125",255)">9,255)">4] = createEntity("dc2",116)">"rack126",255)">15,255)">2);
AggregatorImple agg = new AggregatorImple(Arrays.asList("cluster"),116)">Arrays.asList(AggregateType.count),116)">"*"));
try{
for(TestEvent e : entities){
agg.process(e);
}
List<Double>> result = agg.result();
System.out.println(result.size());
result.get("cluster1")).get(0));
"cluster2")).get(0));
}catch(Exception ex){
out.println(ex);
}
agg = new "datacenter"),114)">out.printf(String.valueOf(result.size())+"\n");
"dc1")).get("dc2")).get(out.println(ex);
}
agg = new AggregatorImple(new ArrayList<String>(),116)">AggregateType.sum),116)">"numHosts"));
result.get(new String>()).get(out.println((double)(entities[0].getNumHosts()+entities[1].getNumHosts()+
entities[2].getNumHosts()+entities[3].getNumHosts()+entities[4].getNumHosts()));
}catch(out.println(ex);
}
}
public static void main(String[] args) {
Test test = new Test();
test.testSingleGroupbyFieldSingleFunctionForCount();
}
}
代码地址:https://github.com/sumpan/groupby