HiveSQL Parsing (Based on the AST)
Our project has to exchange data with the big-data team, so I needed to extract the tables referenced in a Hive query and its outermost columns; down the road we will probably also need table- and column-level lineage. Searching the web turned up what looked like a single answer, copy-pasted everywhere from the same template, and it did not quite fit my needs either. So I decided to parse out what I wanted myself. It is just barely good enough, and I am writing it down here for reference.
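Hive ships the parser this post builds on: `org.apache.hadoop.hive.ql.parse.ParseDriver` turns a SQL string into an `ASTNode` tree whose nodes carry `HiveParser` token types such as `TOK_QUERY`, `TOK_FROM`, `TOK_INSERT`, and `TOK_SELECT`. A minimal sketch of that entry point, assuming only that `hive-exec` (plus its transitive dependencies) is on the classpath; the tree in the comment is abbreviated and illustrative:

```java
import org.apache.hadoop.hive.ql.parse.ASTNode;
import org.apache.hadoop.hive.ql.parse.ParseDriver;
import org.apache.hadoop.hive.ql.parse.ParseException;

public class AstDemo {
    public static void main(String[] args) throws ParseException {
        ParseDriver pd = new ParseDriver();
        // Parse a trivial query into Hive's AST
        ASTNode ast = pd.parse("select id from ods.arc_user");
        // Prints a LISP-style tree, roughly:
        // (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME ods arc_user)))
        //   (TOK_INSERT ... (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL id)))))
        System.out.println(ast.toStringTree());
    }
}
```

The class below is nothing more than a recursive walk over this tree, collecting table names, column aliases, and partition specs along the way.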
```java
import com.google.common.base.Joiner;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections.CollectionUtils;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.parse.ASTNode;
import org.apache.hadoop.hive.ql.parse.ParseDriver;

import java.util.*;
import java.util.stream.Collectors;

import static org.apache.hadoop.hive.ql.parse.HiveParser.*;

/**
 * @author chentiefeng
 * @date 2019/10/21 13:51
 */
@Slf4j
public class HiveSqlParse {
    private ParseDriver pd = new ParseDriver();
    /**
     * Source tables (table name, alias)
     */
    private List<String[]> sourceTable = Lists.newArrayList();
    /**
     * Tables written to by INSERT
     */
    private List<String> insertTables = Lists.newArrayList();
    /**
     * Outermost columns
     */
    private List<String> outermostColumns = Lists.newArrayList();
    /**
     * Insert partition info (partition column, partition value)
     */
    private Map<String, String> partitionMap = Maps.newHashMap();
    /**
     * Outermost SELECT node
     */
    private ASTNode outermostSelNode = null;
    /**
     * Outermost INSERT node
     */
    private ASTNode outermostInsertNode = null;
    /**
     * Stacks holding per-query parse info during the walk
     */
    private Stack<HiveTableParseInfo> tableParseInfoSelStack = new Stack<>();
    private Stack<HiveTableParseInfo> tableParseInfoFromStack = new Stack<>();
    /**
     * Table relationship parse info, excluding source tables
     */
    private HiveTableParseInfo tableParseInfo = null;

    public HiveSqlParse() {
    }

    public HiveSqlParse(String sql) {
        parse(sql);
    }

    /**
     * Parse the SQL statement.
     *
     * @param sql the HiveQL to analyze
     */
    public void parse(String sql) {
        try {
            ASTNode ast = pd.parse(sql);
            log.info("hiveSql={}, astTree={}", sql, ast.toStringTree());
            parseNode(ast);
            insert(outermostInsertNode);
            outermostColumns(outermostSelNode);
            // Drop the insert target from the source list; a plain SELECT has no insert target
            if (!insertTables.isEmpty()) {
                sourceTable.removeIf(arr -> arr[0].equals(insertTables.get(0)));
            }
        } catch (Exception e) {
            log.error(e.getMessage(), e);
            throw new RuntimeException(e);
        }
    }

    private void parseNode(ASTNode ast) {
        if (CollectionUtils.isNotEmpty(ast.getChildren())) {
            for (Node child : ast.getChildren()) {
                ASTNode cc = (ASTNode) child;
                switch (cc.getToken().getType()) {
                    case TOK_INSERT:
                        outermostInsertNode = cc;
                        break;
                    case TOK_TABNAME:
                        // Table name may be db.table, so join the identifier children with "."
                        String tableName = Joiner.on(".").join(cc.getChildren().stream()
                                .map(n -> ((ASTNode) n).getText()).collect(Collectors.toList()));
                        // The parent's last child is either the alias or the TOK_TABNAME itself (no alias)
                        ASTNode ccChild = (ASTNode) cc.getParent().getChild(cc.getParent().getChildCount() - 1);
                        HiveTableParseInfo sourceTableParseInfo = new HiveTableParseInfo();
                        if (ccChild.getToken().getType() == TOK_TABNAME) {
                            sourceTable.add(new String[]{tableName, ""});
                            sourceTableParseInfo.setAlias("");
                        } else {
                            sourceTable.add(new String[]{tableName, ccChild.getText()});
                            sourceTableParseInfo.setAlias(ccChild.getText());
                        }
                        sourceTableParseInfo.setName(tableName);
                        if (!tableParseInfoFromStack.empty()) {
                            tableParseInfoFromStack.pop().getTables().add(sourceTableParseInfo);
                        }
                        break;
                    case TOK_QUERY:
                        // A subquery: the parent's last child carries its alias
                        ASTNode ccc = (ASTNode) cc.getParent().getChild(cc.getParent().getChildCount() - 1);
                        if (ccc.getToken().getType() != TOK_QUERY) {
                            HiveTableParseInfo table = new HiveTableParseInfo();
                            table.setAlias(ccc.getText());
                            tableParseInfoSelStack.push(table);
                            tableParseInfoFromStack.push(table);
                        }
                        break;
                    case TOK_SELECT:
                    case TOK_SELECTDI:
                        // Attach the finished (sub)query to its parent, or keep it as the root result
                        HiveTableParseInfo pop = tableParseInfoSelStack.pop();
                        if (!tableParseInfoSelStack.empty()) {
                            HiveTableParseInfo father = tableParseInfoSelStack.peek();
                            if (Objects.nonNull(father)) {
                                father.getTables().add(pop);
                            }
                        } else {
                            tableParseInfo = pop;
                        }
                        parseColumns(cc, pop);
                        continue;
                    default:
                }
                parseNode(cc);
            }
        }
    }

    private void insert(ASTNode cn) {
        if (CollectionUtils.isEmpty(cn.getChildren())) {
            return;
        }
        for (Node child : cn.getChildren()) {
            ASTNode cc = (ASTNode) child;
            switch (cc.getToken().getType()) {
                case TOK_INSERT_INTO:
                case TOK_DESTINATION:
                    insertTable(cn);
                    continue;
                case TOK_SELECT:
                case TOK_SELECTDI:
                    // TOK_SELECTDI covers SELECT DISTINCT at the outermost level
                    outermostSelNode = cn;
                    continue;
                default:
            }
            insert(cc);
        }
    }

    private void parseColumns(ASTNode cc, HiveTableParseInfo table) {
        for (Node node : cc.getChildren()) {
            ASTNode tokSelExpr = (ASTNode) node;
            HiveTableParseInfo.HiveTableColumnParseInfo column = new HiveTableParseInfo.HiveTableColumnParseInfo();
            String alias = getSelExprAlias(tokSelExpr);
            column.setName(alias);
            parseColumn(tokSelExpr, column);
            table.getColumns().add(column);
        }
    }

    private void parseColumn(ASTNode tokSelExpr, HiveTableParseInfo.HiveTableColumnParseInfo column) {
        if (CollectionUtils.isEmpty(tokSelExpr.getChildren())) {
            return;
        }
        for (Node child : tokSelExpr.getChildren()) {
            ASTNode cc = (ASTNode) child;
            if (cc.getToken().getType() == TOK_TABLE_OR_COL) {
                ASTNode ccc = (ASTNode) cc.getParent().getChild(cc.getParent().getChildCount() - 1);
                String[] item;
                if (ccc.getToken().getType() == TOK_TABLE_OR_COL) {
                    // Unqualified column: (column, "")
                    item = new String[]{cc.getChild(0).getText(), ""};
                } else {
                    // Qualified column t.c under a DOT node: (column, table alias)
                    item = new String[]{ccc.getText(), cc.getChild(0).getText()};
                }
                // Deduplicate source pairs
                Optional<String[]> any = column.getSourceList().stream().filter(s -> Arrays.equals(item, s)).findAny();
                if (!any.isPresent()) {
                    column.getSourceList().add(item);
                }
                continue;
            }
            parseColumn(cc, column);
        }
    }

    /**
     * Collect the INSERT target table and partition spec.
     *
     * @param cn node to scan
     */
    private void insertTable(ASTNode cn) {
        if (CollectionUtils.isEmpty(cn.getChildren())) {
            return;
        }
        for (Node child : cn.getChildren()) {
            ASTNode cc = (ASTNode) child;
            switch (cc.getToken().getType()) {
                case TOK_TABNAME:
                    String tableName = Joiner.on(".").join(cc.getChildren().stream()
                            .map(n -> ((ASTNode) n).getText()).collect(Collectors.toList()));
                    insertTables.add(tableName);
                    break;
                case TOK_PARTVAL:
                    // Static partition has (name, value); dynamic partition has only the name
                    if (cc.getChildCount() == 2) {
                        partitionMap.put(cc.getChild(0).getText(), cc.getChild(1).getText());
                    } else {
                        partitionMap.put(cc.getChild(0).getText(), null);
                    }
                    break;
                default:
            }
            insertTable(cc);
        }
    }

    /**
     * Collect the outermost column aliases.
     *
     * @param cn node to scan
     */
    private void outermostColumns(ASTNode cn) {
        if (CollectionUtils.isEmpty(cn.getChildren())) {
            return;
        }
        for (Node cnChild : cn.getChildren()) {
            ASTNode cc = (ASTNode) cnChild;
            if (cc.getToken().getType() == TOK_SELEXPR) {
                String alias = getSelExprAlias(cc);
                outermostColumns.add(alias);
                continue;
            }
            outermostColumns(cc);
        }
    }

    /**
     * Get the alias of a select expression.
     *
     * @param cc TOK_SELEXPR node
     * @return the alias, or the underlying column name when no alias is given
     */
    private String getSelExprAlias(ASTNode cc) {
        ASTNode child = (ASTNode) cc.getChild(cc.getChildCount() - 1);
        if (child.getToken().getType() == TOK_TABLE_OR_COL || child.getToken().getType() == DOT) {
            return child.getChild(child.getChildCount() - 1).getText();
        } else {
            return child.getText();
        }
    }

    public List<String> getOutermostColumns() {
        return outermostColumns;
    }

    public List<String> getSourceTables() {
        return sourceTable.stream().map(t -> t[0]).distinct().collect(Collectors.toList());
    }

    public String getInsertTable() {
        return CollectionUtils.isNotEmpty(insertTables) ? insertTables.get(0) : null;
    }

    public Map<String, String> getPartition() {
        return partitionMap;
    }

    public HiveTableParseInfo getTableParseInfo() {
        return tableParseInfo;
    }

    public static void main(String[] args) {
        String sql23 = "insert overwrite table risk_event partition(year='2019',dt) select t.ops as order_no,t.id_no ,concat(t.consumer_no,'aa') dd,aadx from (select concat(a.opt_id,b.opt_id) as ops,b.id_no from ods.arc_event a left outer join ods.arc_user b on a.consumer_no = b.consumer_no) t left outer join (select order_no from arc_verify where dt = '20191023') t1 on t.consumer_no = t1.consumer_no";
        // String sql23 = "insert overwrite table riskt_eventpartition select opt_id from arc_event a inner join arc_user b";
        // String sql23 = "insert overwrite table riskt_eventpartition select opt_id from arc_event";
        // String sql23 = "select sum(case when rcw.eventid=2 and rcw.method = 'sendevent' then 1 else 0 end) as successcnt," +
        //         " sum(case when rcw.eventid=4 and rcw.method = 'risklevel' then 1 else 0 end) as paycnt," +
        //         " sum(case when rcw.eventid=2 and rcw.method = 'sendevent' then 1 else 0 end)/sum(case when rcw.eventid=4 and rcw.method = 'risklevel' then 1 else 0 end) as rate" +
        //         " from (\n" +
        //         "    select distinct payorderid," +
        //         "    eventid," +
        //         "    method" +
        //         "    from log.pay_rc_warden_event_basic" +
        //         "    where dt = '20180715'" +
        //         " ) rcw";
        HiveSqlParse hiveSqlParse = new HiveSqlParse(sql23);
        System.out.println(hiveSqlParse.getSourceTables());
        System.out.println(hiveSqlParse.getOutermostColumns());
        System.out.println(hiveSqlParse.getInsertTable());
        System.out.println(hiveSqlParse.getPartition());
        System.out.println(hiveSqlParse.getTableParseInfo());
    }
}
```
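The parser relies on a `HiveTableParseInfo` bean that the post never shows. Below is a minimal sketch reconstructed from the calls made on it (`setName`/`setAlias`, `getTables()`, `getColumns()`, and the nested column type's `setName`/`getSourceList()`); the exact field layout is an assumption, not the author's original class:

```java
import com.google.common.collect.Lists;
import lombok.Data;

import java.util.List;

/**
 * Parse info for one (sub)query: its name/alias, the tables it reads from,
 * and the columns it projects. Reconstructed sketch, not the original class.
 */
@Data
public class HiveTableParseInfo {
    private String name;
    private String alias;
    /** Tables (or aliased subqueries) this query selects from */
    private List<HiveTableParseInfo> tables = Lists.newArrayList();
    /** Columns projected by this query */
    private List<HiveTableColumnParseInfo> columns = Lists.newArrayList();

    /**
     * One projected column: its output name plus the (column, table-alias)
     * source pairs it derives from; the alias slot is "" for unqualified columns.
     */
    @Data
    public static class HiveTableColumnParseInfo {
        private String name;
        private List<String[]> sourceList = Lists.newArrayList();
    }
}
```

With this bean in place and `hive-exec`, Guava, commons-collections, and Lombok on the classpath, the `main` method above should run as-is; Lombok's `@Data` also supplies the `toString()` that the last `println` relies on.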