Skip to content

Commit

Permalink
Grouping and aggregation (#1135)
Browse files Browse the repository at this point in the history
This PR provides experimental support for SQL queries with grouping and aggregation. These require the mixing normaliser (mixing_norm=on in the configuration file).

The result of grouping over a relation is represented as a finite map, which in Links is treated as a list of (grouping key, associated subrelation) pairs. Aggregation can then be applied groupwise to a finite map to obtain again a relation. Such Links queries are translated to SQL queries using group by and aggregates.

The following operations are provided in the prelude and can be used in grouping queries:

```
groupBy;
fun : ((a) -b-> c, [a]) -b-> [(c, [a])]

```
`groupBy(f, l)` takes a list `l` and a grouping criterion `f`: it applies `f` to each item in the list to obtain its grouping key, and finally returns a finite map, i.e. a list of pairs associating to each grouping key the list of items of the original list `l` that share the same grouping key.

e.g. 
`groupBy(fun (x) { (is_even = even(x)) }, [1, 2, 3]) = [((is_even = true), [2]), ((is_even = false), [1,3])]`

```
concatMapKey;
fun : ((a) -b-> [c], [(a, [_])]) -b-> [c]

```
`concatMapKey` works like `concatMap`, but it takes as input a finite map rather than any list; it performs comprehension over the (deduplicated) key set of the input map (for this reason, the output type of the finite map argument is irrelevant).

```
lookupG;
fun : (a, [(a, [b])]) -> [b]

```
Given a key `k` and a finite map `m`, `lookupG(k,m)` returns the list of values associated by the map `m` to the key `k`.

```
aggBy;
fun : ([(a, [b])], ([b]) -c-> d) -c-> [(a, d)]

```
`aggBy(m, f)` takes a finite map `m` and an aggregation criterion `f`: its purpose is to apply the aggregation to the input map on a key by key basis
This is where our LINQ infrastructure gets hacky: we can only support f when it is in the form:
`fun (t) { (outlabel1 = agg1(for (x <- t) [x.inlabel1]), ..., outlabeln = aggn(for (x <- t) [x.inlabeln])) }`

Where `agg1, ... aggn` are certain aggregation functions defined in the prelude:
`sum, sumf, avg, avgF, min_list, minF_list, max_list, maxF_list`

Any other use of `aggBy` will NOT be databaseable.

Example queries can be found in the database/grouping.links test file (which requires tables created by database/grouping-create.links)
  • Loading branch information
wricciot authored Oct 6, 2023
1 parent 5cfca65 commit 35ecc43
Show file tree
Hide file tree
Showing 27 changed files with 1,179 additions and 263 deletions.
16 changes: 10 additions & 6 deletions CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,15 +1,19 @@
# Unreleased
# Unreleased (scheduled for 0.9.8)

List of changes since the latest release.

## Queries mixing set and bag semantics
Links now provides experimental support for SQL queries with grouping and aggregation. These require the _mixing_ normaliser (`mixing_norm=on` in the configuration file).

The result of grouping over a relation is represented as a finite map, which in Links is treated as a list of (grouping key, associated subrelation) pairs. Aggregation can then be applied groupwise to a finite map to obtain again a relation. Such Links queries are translated to SQL queries using `group by` and aggregates.

Further information on this feature is provided in the [Links GitHub wiki](https://github.com/links-lang/links/wiki/Grouping-and-aggregation).

## Other changes and fixes
* The package `links-mysql`, based on the `mysql` opam package is no
longer supported. Instead, the package `links-mysql8`, based on the
`mysql8` package is provided, which also supports version of MySQL
prior to 8.

# 0.9.8

This release ...

* Control-flow linearity: Links now tracks control-flow linearity when
the flag `--control-flow-linearity` is enabled. This extension fixes
a long-standing soundness bug (see issue
Expand Down
6 changes: 4 additions & 2 deletions core/evalir.ml
Original file line number Diff line number Diff line change
Expand Up @@ -775,7 +775,7 @@ struct
begin
match evaluator e with
| None -> computation env cont e
| Some (db, q, t) ->
| Some (db, q, t, readback) ->
let q = db#string_of_query ~range q in
let (fieldMap, _, _) =
let r, _ = Types.unwrap_row (TypeUtils.extract_row t) in
Expand All @@ -790,7 +790,9 @@ struct
fieldMap
[]
in
apply_cont cont env (Database.execute_select fields q db)
Database.execute_select fields q db
|> readback (* unflattens records/finite maps *)
|> apply_cont cont env
end
end
| TemporalJoin (tmp, e, _t) ->
Expand Down
2 changes: 2 additions & 0 deletions core/json.ml
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,8 @@ let rec jsonize_value' : Value.t -> Yojson.Basic.t =
| #Value.primitive_value as p -> jsonize_primitive p
| `Variant (label, value) ->
lit ~tag:"Variant" [("_label", `String label); ("_value", jsonize_value' value)]
| `Entry (key, value) ->
lit ~tag:"MapEntry" [("_key", jsonize_value' key); ("_value", jsonize_value' value)]
| `Record fields ->
lit ~tag:"Record" (List.map (fun (k, v) -> (k, jsonize_value' v )) fields)
| `List l -> cons_listify (List.map jsonize_value' l)
Expand Down
25 changes: 25 additions & 0 deletions core/lib.ml
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,26 @@ let env : (string * (located_primitive * Types.datatype * pure)) list = [
"^.", float_op ( ** ) PURE;
"^^", string_op ( ^ ) PURE;

"max_int",
(Value.box_int max_int,
datatype "Int",
PURE);

"min_int",
(Value.box_int min_int,
datatype "Int",
PURE);

"infinity",
(Value.box_float Float.infinity,
datatype "Float",
PURE);

"neg_infinity",
(Value.box_float Float.neg_infinity,
datatype "Float",
PURE);

(* Comparisons *)
"==",
(p2 (fun v1 v2 -> Value.box_bool (equal v1 v2)),
Expand Down Expand Up @@ -592,6 +612,11 @@ let env : (string * (located_primitive * Types.datatype * pure)) list = [
datatype "([a]) -> Int",
PURE);

"Sum",
(p1 (Value.unbox_list ->- List.fold_left (fun x y -> x + Value.unbox_int y) 0 ->- Value.box_int),
datatype "([Int]) -> Int",
PURE);

"take",
(p2 (fun n l ->
Value.box_list (Utility.take (Value.unbox_int n) (Value.unbox_list l))),
Expand Down
23 changes: 14 additions & 9 deletions core/query/delateralize.ml
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,19 @@ let graph_query (q1,ty1) x (q2,ty2) =
let y = Var.fresh_raw_var () in
let p = Q.flattened_pair (QL.Var (x,ty1)) (QL.Var (y,ty2)) in
let ftys = Q.flattened_pair_ft (QL.Var (x,ty1)) (QL.Var (y,ty2)) in
QL.For (None, [(x, q1); (y, q2)], [], QL.Singleton p), ftys
QL.For (None, [(QL.Entries, x, q1); (QL.Entries, y, q2)], [], QL.Singleton p), ftys

(*
DELATERALIZING REWRITE for Prom:
for gs, y <- Prom(q3) do q1 -- s.t. x <- q2 in gs
~> for gs, p <- Prom(G(x <- Dedup q2; q3))
where x = p.1 do (\lambda y.q1) p.2
there's a similar rewrite for key comprehension, but the bag promotion is implicit:
this is why we require a genkind parameter
*)
let prom_delateralize gs q1 x (q2,ty2) y (q3,ty3) =
let rew_delateralize genkind gs q1 x (q2,ty2) y (q3,ty3) =
let cast x = match genkind with QL.Entries -> QL.Prom x | _ -> x in
let p = Var.fresh_raw_var () in
let graph, ftys = graph_query (QL.Dedup q2,ty2) x (q3,ty3) in
let vp = QL.Var (p,Types.make_record_type ftys) in
Expand Down Expand Up @@ -58,7 +62,7 @@ let prom_delateralize gs q1 x (q2,ty2) y (q3,ty3) =
let q1_rp = QL.subst q1 y rp
in
QL.For (None,
gs @ [(p, QL.Prom graph)],
gs @ [(QL.Entries, p, cast graph)],
[],
QL.If (eq_query, q1_rp, QL.nil))

Expand All @@ -70,23 +74,24 @@ let rec delateralize_step q =
match q with
| QL.For (_tag, gs, os, q) ->
let rec findgs gsx = function
| (y,QL.Prom qy as gy)::gsy ->
| (QL.Entries as genkind, y,QL.Prom qy as gy)::gsy
| (QL.Keys as genkind, y, qy as gy)::gsy ->
begin
match QL.occurs_free_gens gsx qy with
(* tail-consing is annoying, but occurs_free_list needs arguments in this order *)
| None -> findgs (gsx@[gy]) gsy
| Some (x,qx,tyx) -> Some (gsx,x,qx,tyx,y,qy,gsy)
| Some (x,qx,tyx) -> Some (gsx,x,qx,tyx,genkind,y,qy,gsy)
end
| gy::gsy -> findgs (gy::gsx) gsy
| [] -> None
in begin
match findgs [] gs with
| Some (gsx,x,qx,tyx,y,qy,gsy) ->
| Some (gsx,x,qx,tyx,gky,y,qy,gsy) ->
let qf = QL.For (None, gsy, [], q) in
let tyy = Q.type_of_for_var qy in
Some (prom_delateralize gsx qf x (qx,tyx) y (qy,tyy))
let tyy = Q.type_of_for_var gky qy in
Some (rew_delateralize gky gsx qf x (qx,tyx) y (qy,tyy))
| None ->
let ogs = gs >>==? (fun (z,qz) -> ds qz >>=? fun qz' -> Some (z,qz')) in
let ogs = gs >>==? (fun (genkind, z,qz) -> ds qz >>=? fun qz' -> Some (genkind,z,qz')) in
let oq = ds q in
begin
match ogs, oq with
Expand Down
123 changes: 107 additions & 16 deletions core/query/evalMixingQuery.ml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ let eval_error fmt : 'r =
let mapstrcat sep f l = l |> List.map f |> String.concat sep

let dummy_sql_empty_query =
(S.All,S.Fields [(S.Constant (Constant.Int 42), "@unit@")], [], S.Constant (Constant.Bool false), [])
(S.All,S.Fields [(S.Constant (Constant.Int 42), "@unit@")], [], S.Constant (Constant.Bool false), [], [])

let dependency_of_contains_free = function true -> S.Lateral | _ -> S.Standard

Expand All @@ -36,36 +36,93 @@ and disjunct is_set = function
| QL.Prom p -> sql_of_query S.Distinct p
| QL.Singleton _ as j -> S.Select (body is_set [] [] j)
| QL.For (_, gs, os, j) -> S.Select (body is_set gs os j)
| _arg -> Debug.print ("error in EvalMixingQuery.disjunct: unexpected arg = " ^ QL.show _arg); failwith "disjunct"
| QL.AggBy (ar, q) -> aggregator ar q
| _arg ->
Debug.print ("error in EvalMixingQuery.disjunct: unexpected arg = " ^ QL.show _arg);
failwith "disjunct"

and aggregator ar q =
let aggr = function
| QL.Primitive "Sum" -> "sum"
| QL.Primitive "SumF" -> "sum"
| QL.Primitive "Avg" -> "avg"
| QL.Primitive "AvgF" -> "avg"
| QL.Primitive "Min" -> "min"
| QL.Primitive "Max" -> "max"
| QL.Primitive "MinF" -> "min"
| QL.Primitive "MaxF" -> "max"
| QL.Primitive "length" -> "count"
| _ -> assert false
in
let z = Var.fresh_raw_var () in
let tyk, _tyv = q |> QL.type_of_expression |> Types.unwrap_map_type in
let fsk, _, _ = tyk |> Types.extract_row |> Types.extract_row_parts in
let fields_k = fsk |> StringMap.to_alist |> List.map (fun (f,_) -> S.Project (z, "1@" ^ f), "1@" ^ f) in
let fields_v = ar |> StringMap.to_alist |> List.map (fun (f_out, (aggfun, f_in)) ->
S.Apply (aggr aggfun, [S.Project (z, "2@" ^ f_in)]), "2@" ^ f_out)
in
let fields = fields_k @ fields_v in
let gbys = List.map (fun (_,f) -> S.Project (z, f)) fields_k in
S.Select (S.All, S.Fields fields, [S.Subquery (S.Standard, sql_of_query S.All q, z)], S.Constant (Constant.Bool true), gbys, [])

and generator locvars = function
| (v, QL.Prom p) -> (S.Subquery (dependency_of_contains_free (E.contains_free locvars p), sql_of_query S.Distinct p, v))
| (v, QL.Table Value.Table.{ name; _}) -> (S.TableRef (name, v))
| (v, QL.Dedup (QL.Table Value.Table.{ name; _ })) ->
S.Subquery (S.Standard, S.Select (S.Distinct, S.Star, [S.TableRef (name, v)], S.Constant (Constant.Bool true), []), v)
| (_, _arg) -> Debug.print ("error in EvalMixingQuery.disjunct: unexpected arg = " ^ QL.show _arg); failwith "generator"
| (QL.Entries, v, QL.Prom p) -> (S.Subquery (dependency_of_contains_free (E.contains_free locvars p), sql_of_query S.Distinct p, v))
| (QL.Entries, v, QL.Table Value.Table.{ name; _}) -> (S.TableRef (name, v))
| (QL.Entries, v, QL.Dedup (QL.Table Value.Table.{ name; _ })) ->
S.Subquery (S.Standard, S.Select (S.Distinct, S.Star, [S.TableRef (name, v)], S.Constant (Constant.Bool true), [], []), v)
| (QL.Keys, v, QL.GroupBy ((x, QL.Record gc), QL.Table Value.Table.{ name; _}))
| (QL.Keys, v, QL.GroupBy ((x, QL.Record gc), QL.Dedup (QL.Table Value.Table.{ name; _}))) ->
let fields = List.map (fun (f,e) -> (base_exp e, f)) (StringMap.to_alist gc) in
S.Subquery (dependency_of_contains_free (E.contains_free locvars (QL.Record gc)),
S.Select (S.Distinct, S.Fields fields, [S.TableRef (name, x)], S.Constant (Constant.Bool true), [], []), v)
| (QL.Keys, v, q) ->
let z = Var.fresh_raw_var () in
let tyk, _ = q |> QL.type_of_expression |> Types.unwrap_map_type in
let fsk, _, _ = tyk |> Types.extract_row |> Types.extract_row_parts in
let fields =
fsk
|> StringMap.to_alist
|> List.map (fun (f,_) -> S.Project (z, "1@" ^ f), f)
in
S.Subquery (dependency_of_contains_free (E.contains_free locvars q),
S.Select (S.Distinct,
S.Fields fields,
[S.Subquery (S.Standard, sql_of_query S.All q, z)],
S.Constant (Constant.Bool true),
[], []), v)
| (_genkind, _, _arg) -> Debug.print ("error in EvalMixingQuery.disjunct: unexpected arg = " ^ QL.show _arg); failwith "generator"

and body is_set gs os j =
let selquery body where =
let froms =
gs
|> List.fold_left (fun (locvars,acc) (v,_q as g) -> (v::locvars, generator locvars g::acc)) ([],[])
|> List.fold_left (fun (locvars,acc) (_genkind, v,_q as g) -> (v::locvars, generator locvars g::acc)) ([],[])
|> snd
|> List.rev
in
let os = List.map base_exp os in
(is_set, S.Fields body, froms, where, os)
(is_set, S.Fields body, froms, where, [], os)
in
match j with
| QL.Concat [] -> dummy_sql_empty_query
| QL.Singleton (QL.Record fields) ->
selquery
<| List.map (fun (f,x) -> (base_exp x, f)) (StringMap.to_alist fields)
<| Sql.Constant (Constant.Bool true)
| QL.Singleton (QL.MapEntry (QL.Record keys, QL.Record values)) ->
selquery
<| List.map (fun (f,x) -> (base_exp x, "1@" ^ f)) (StringMap.to_alist keys)
@ List.map (fun (f,x) -> (base_exp x, "2@" ^ f)) (StringMap.to_alist values)
<| Sql.Constant (Constant.Bool true)
| QL.If (c, QL.Singleton (QL.Record fields), QL.Concat []) ->
selquery
<| List.map (fun (f,x) -> (base_exp x, f)) (StringMap.to_alist fields)
<| base_exp c
| QL.If (c, QL.Singleton (QL.MapEntry (QL.Record keys, QL.Record values)), QL.Concat []) ->
selquery
<| List.map (fun (f,x) -> (base_exp x, "1@" ^ f)) (StringMap.to_alist keys)
@ List.map (fun (f,x) -> (base_exp x, "2@" ^ f)) (StringMap.to_alist values)
<| base_exp c
| _ -> Debug.print ("error in EvalMixingQuery.body: unexpected j = " ^ QL.show j); failwith "body"

and base_exp = function
Expand All @@ -91,7 +148,16 @@ and base_exp = function
Sql.Apply ("RLIKE", [base_exp s; r])
end
| QL.Apply (QL.Primitive "Empty", [v]) -> S.Empty (sql_of_query S.All v)
(* length takes as input a collection of records so it cannot be converted to S.Aggr *)
| QL.Apply (QL.Primitive "length", [v]) -> S.Length (sql_of_query S.All v)
| QL.Apply (QL.Primitive "Sum", [v]) -> S.Aggr ("sum", sql_of_query S.All v)
| QL.Apply (QL.Primitive "SumF", [v]) -> S.Aggr ("sum", sql_of_query S.All v)
| QL.Apply (QL.Primitive "Avg", [v]) -> S.Aggr ("avg", sql_of_query S.All v)
| QL.Apply (QL.Primitive "AvgF", [v]) -> S.Aggr ("avg", sql_of_query S.All v)
| QL.Apply (QL.Primitive "Min", [v]) -> S.Aggr ("min", sql_of_query S.All v)
| QL.Apply (QL.Primitive "MinF", [v]) -> S.Aggr ("min", sql_of_query S.All v)
| QL.Apply (QL.Primitive "Max", [v]) -> S.Aggr ("max", sql_of_query S.All v)
| QL.Apply (QL.Primitive "MaxF", [v]) -> S.Aggr ("max", sql_of_query S.All v)
| QL.Apply (QL.Primitive f, vs) -> S.Apply (f, List.map base_exp vs)
| QL.Constant c -> S.Constant c
| e ->
Expand All @@ -101,7 +167,7 @@ and base_exp = function
(* external call will start with a bag query *)
let sql_of_query = sql_of_query S.All

let compile_mixing : delateralize:QueryPolicy.t -> Value.env -> (int * int) option * Ir.computation -> (Value.database * Sql.query * Types.datatype) option =
let compile_mixing : delateralize:QueryPolicy.t -> Value.env -> (int * int) option * Ir.computation -> (Value.database * Sql.query * Types.datatype * (Value.t -> Value.t)) option =
fun ~delateralize env (range, e) ->
(* Debug.print ("env: "^Value.show_env env);
Debug.print ("e: "^Ir.show_computation e); *)
Expand All @@ -116,9 +182,34 @@ let compile_mixing : delateralize:QueryPolicy.t -> Value.env -> (int * int) opti
match QL.used_database v with
| None -> None
| Some db ->
let t = Types.unwrap_list_type (QL.type_of_expression v) in
(* Debug.print ("Generated NRC query: " ^ QL.show v ); *)
let q = sql_of_query v in
let _range = None in
(* Debug.print ("Generated SQL query: "^(Sql.string_of_query db _range q)); *)
Some (db, q, t)
let strip_presence = function Types.Present t -> t | _ -> assert false in
let v_flat = QL.FlattenRecords.flatten_query v in
(*
Debug.print ("Generated NRC query: " ^ QL.show v);
Debug.print ("Flattened NRC query: " ^ QL.show v_flat);
*)
let readback = QL.FlattenRecords.unflatten_query (QL.type_of_expression v) in
(* the calling code expects the item type, not the list type *)
let t_flat = Types.unwrap_list_type (QL.type_of_expression v_flat) in
let t_flat =
try
let tyk, tyv = Types.unwrap_mapentry_type t_flat in
let rowk, _, _ = tyk |> Types.extract_row |> Types.extract_row_parts in
let rowv, _, _ = tyv |> Types.extract_row |> Types.extract_row_parts in
let row = StringMap.fold
<| (fun k v acc -> StringMap.add ("1@" ^ k) (strip_presence v) acc)
<| rowk
<| StringMap.empty
in
let row = StringMap.fold
<| (fun k v acc -> StringMap.add ("2@" ^ k) (strip_presence v) acc)
<| rowv
<| row
in
Types.make_record_type row
with _ -> t_flat
in
let q = sql_of_query v_flat in
let _range = None in
Debug.print ("Generated SQL query: "^(db#string_of_query ~range:_range q));
Some (db, q, t_flat, readback)
15 changes: 10 additions & 5 deletions core/query/evalNestedQuery.ml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ let tag_query : QL.t -> QL.t =
If (tag c, tag t, tag e)
| Table t -> Table t
| Singleton e -> Singleton (tag e)
| MapEntry (k, v) -> MapEntry (tag k, tag v)
| Concat es ->
Concat (List.map tag es)
| Dedup t -> Dedup (tag t)
Expand All @@ -39,6 +40,10 @@ let tag_query : QL.t -> QL.t =
| Var (x, t) -> Var (x, t)
| Constant c -> Constant c
| Database db -> Database db
| GroupBy ((x,k), q) -> GroupBy ((x,tag k), tag q)
(* XXX: defensive programming: recursion on ar not needed now, but might be in the future *)
| AggBy (ar, q) -> AggBy (StringMap.map (fun (x,y) -> tag x, y) ar, tag q)
| Lookup (q,k) -> Lookup (tag q, tag k)
in
tag e

Expand Down Expand Up @@ -260,7 +265,7 @@ end
*)
module Split =
struct
type gen = Var.var * QL.t
type gen = QL.genkind * Var.var * QL.t

let rec query : gen list -> QL.t list -> QL.t -> QL.t -> QL.t list =
fun gs os cond ->
Expand Down Expand Up @@ -304,7 +309,7 @@ struct
[@@deriving show]

type cond = QL.t option
type gen = Var.var * QL.t
type gen = QL.genkind * Var.var * QL.t

let where c e =
match c with
Expand Down Expand Up @@ -446,7 +451,7 @@ struct
let rec lins c : let_clause =
let gs_out = List.concat (init (gens c)) in

let ys = List.map fst gs_out in
let ys = List.map (fun (_,x,_) -> x) gs_out in

let x_out =
List.fold_right
Expand All @@ -460,7 +465,7 @@ struct

let r_out =
tuple (List.map
(fun (x, source) ->
(fun (_genkind, x, source) ->
match source with
| QL.Table t ->
let tyx = Types.make_record_type (QL.table_field_types t) in
Expand All @@ -470,7 +475,7 @@ struct
let r_out_type =
Types.make_tuple_type
(List.map
(fun (_, source) ->
(fun (_genkind,_, source) ->
match source with
| QL.Table Value.Table.{ row; _ } ->
Types.Record (Types.Row row)
Expand Down
Loading

0 comments on commit 35ecc43

Please sign in to comment.