PostgreSQL 优化器的初步分析:query_planner()
PostgreSQL 优化器的初步分析:
query_planner():
query_planner() 是优化器生成 path 的基础,所以首先从这里开始分析起。
query_planner() 负责生成一个基本的 path,这个 path 除了基本的扫描表的操作外,最后最多只包含连接操作,剩下的其他诸如聚合或者 limit 等操作,则会到更外层的 grouping_planner() 中进行。
进入 query_planner() 第一步就是对每个基本表生成 RelOptInfo 结构:
基本表就是 FROM 后指定的一个普通的表,而不是由连接等操作生成的表。 在优化生成 plan 的过程中,RelOptInfo 中包含了对一个表所需的信息;
typedef struct RelOptInfo
{
NodeTag type;
RelOptKind reloptkind;
/* all relations included in this RelOptInfo */
Relids relids; /* set of base relids (rangetable indexes) */
/* size estimates generated by planner */
double rows; /* estimated number of result tuples */
/* per-relation planner control flags */
bool consider_startup; /* keep cheap-startup-cost paths? */
bool consider_param_startup; /* ditto, for parameterized paths? */
bool consider_parallel; /* consider parallel paths? */
/* default result targetlist for Paths scanning this relation */
struct PathTarget *reltarget; /* list of Vars/Exprs, cost, width */
/* materialization information */
List *pathlist; /* Path structures */
List *ppilist; /* ParamPathInfos used in pathlist */
List *partial_pathlist; /* partial Paths */
struct Path *cheapest_startup_path;
struct Path *cheapest_total_path;
struct Path *cheapest_unique_path;
List *cheapest_parameterized_paths;
/* parameterization information needed for both base rels and join rels */
/* (see also lateral_vars and lateral_referencers) */
Relids direct_lateral_relids; /* rels directly laterally referenced */
Relids lateral_relids; /* minimum parameterization of rel */
/* information about a base rel (not set for join rels!) */
Index relid;
Oid reltablespace; /* containing tablespace */
RTEKind rtekind; /* RELATION, SUBQUERY, or FUNCTION */
AttrNumber min_attr; /* smallest attrno of rel (often <0) */
AttrNumber max_attr; /* largest attrno of rel */
Relids *attr_needed; /* array indexed [min_attr .. max_attr] */
int32 *attr_widths; /* array indexed [min_attr .. max_attr] */
List *lateral_vars; /* LATERAL Vars and PHVs referenced by rel */
Relids lateral_referencers; /* rels that reference me laterally */
List *indexlist; /* list of IndexOptInfo */
BlockNumber pages; /* size estimates derived from pg_class */
double tuples;
double allvisfrac;
PlannerInfo *subroot; /* if subquery */
List *subplan_params; /* if subquery */
int rel_parallel_workers; /* wanted number of parallel workers */
/* Information about foreign tables and foreign joins */
Oid serverid; /* identifies server for the table or join */
Oid userid; /* identifies user to check access as */
bool useridiscurrent; /* join is only valid for current user */
/* use "struct FdwRoutine" to avoid including fdwapi.h here */
struct FdwRoutine *fdwroutine;
void *fdw_private;
/* used by various scans and joins: */
List *baserestrictinfo; /* RestrictInfo structures (if base
* rel) */
QualCost baserestrictcost; /* cost of evaluating the above */
List *joininfo; /* RestrictInfo structures for join clauses
* involving this rel */
bool has_eclass_joins; /* T means joininfo is incomplete */
} RelOptInfo;
以上的 RelOptInfo 结构中的元素,在接下来优化的过程中会逐渐用到。
接下来还需要介绍 PlannerInfo 结构,该结构是整个优化器执行过程中信息存放的地方,所以每个生成的 RelOptInfo 都存放在 PlannerInfo 中:
/*----------
* PlannerInfo
* Per-query information for planning/optimization
*
* This struct is conventionally called "root" in all the planner routines.
* It holds links to all of the planner's working state, in addition to the
* original Query. Note that at present the planner extensively modifies
* the passed-in Query data structure; someday that should stop.
*----------
*/
typedef struct PlannerInfo
{
NodeTag type;
Query *parse; /* the Query being planned */
PlannerGlobal *glob; /* global info for current planner run */
Index query_level; /* 1 at the outermost Query */
struct PlannerInfo *parent_root; /* NULL at outermost Query */
/*
* plan_params contains the expressions that this query level needs to
* make available to a lower query level that is currently being planned.
* outer_params contains the paramIds of PARAM_EXEC Params that outer
* query levels will make available to this query level.
*/
List *plan_params; /* list of PlannerParamItems, see below */
Bitmapset *outer_params;
/*
* simple_rel_array holds pointers to "base rels" and "other rels" (see
* comments for RelOptInfo for more info). It is indexed by rangetable
* index (so entry 0 is always wasted). Entries can be NULL when an RTE
* does not correspond to a base relation, such as a join RTE or an
* unreferenced view RTE; or if the RelOptInfo hasn't been made yet.
*/
struct RelOptInfo **simple_rel_array; /* All 1-rel RelOptInfos */
int simple_rel_array_size; /* allocated size of array */
/*
* simple_rte_array is the same length as simple_rel_array and holds
* pointers to the associated rangetable entries. This lets us avoid
* rt_fetch(), which can be a bit slow once large inheritance sets have
* been expanded.
*/
RangeTblEntry **simple_rte_array; /* rangetable as an array */
先通过下面语句为每个要创建 RelOptInfo 的基本表申请空间:
一个 RelOptInfo 对应一个 RangeTableEntry,而这些 RTE 为了避免干扰,是从 Query 中的 jointree 中取出的。
setup_simple_rel_arrays(PlannerInfo *root)
{
Index rti;
ListCell *lc;
/* Arrays are accessed using RT indexes (1..N) */
/* 为 Query 中 RangeTable 中的每一个都创建一个 RelOptInfo,*/
/* 由于 RangeTable 是从 1 到 N 的,所以大小 + 1*/
root->simple_rel_array_size = list_length(root->parse->rtable) + 1;
/* simple_rel_array is initialized to all NULLs */
root->simple_rel_array = (RelOptInfo **)
palloc0(root->simple_rel_array_size * sizeof(RelOptInfo *));
/* simple_rte_array is an array equivalent of the rtable list */
/* RangeTable 中每一个 RangeTableEntry(RTE) 的快捷入口 */
root->simple_rte_array = (RangeTblEntry **)
palloc0(root->simple_rel_array_size * sizeof(RangeTblEntry *));
rti = 1;
foreach(lc, root->parse->rtable)
{
RangeTblEntry *rte = (RangeTblEntry *) lfirst(lc);
root->simple_rte_array[rti++] = rte;
}
}
然后在 add_base_rels_to_query() 中初始化每个 RelOptInfo:
add_base_rels_to_query(root, (Node *) parse->jointree);
从 Query.jointree 中而不是从 Query.rtable 中取出 RTE 可以仅取到不包含 Join 的基本表;
对于每个基本表进行下列操作:
/*
* build_simple_rel
* Construct a new RelOptInfo for a base relation or 'other' relation.
*/
RelOptInfo *
build_simple_rel(PlannerInfo *root, int relid, RelOptKind reloptkind)
{
RelOptInfo *rel;
RangeTblEntry *rte;
/* Rel should not exist already */
Assert(relid > 0 && relid < root->simple_rel_array_size);
if (root->simple_rel_array[relid] != NULL)
elog(ERROR, "rel %d already exists", relid);
/* Fetch RTE for relation */
rte = root->simple_rte_array[relid];
Assert(rte != NULL);
rel = makeNode(RelOptInfo);
rel->reloptkind = reloptkind;
rel->relids = bms_make_singleton(relid);
rel->rows = 0;
/* cheap startup cost is interesting iff not all tuples to be retrieved */
rel->consider_startup = (root->tuple_fraction > 0);
rel->consider_param_startup = false; /* might get changed later */
rel->consider_parallel = false; /* might get changed later */
rel->reltarget = create_empty_pathtarget();
rel->pathlist = NIL;
rel->ppilist = NIL;
rel->partial_pathlist = NIL;
rel->cheapest_startup_path = NULL;
rel->cheapest_total_path = NULL;
rel->cheapest_unique_path = NULL;
rel->cheapest_parameterized_paths = NIL;
rel->direct_lateral_relids = NULL;
rel->lateral_relids = NULL;
rel->relid = relid;
rel->rtekind = rte->rtekind;
/* min_attr, max_attr, attr_needed, attr_widths are set below */
rel->lateral_vars = NIL;
rel->lateral_referencers = NULL;
rel->indexlist = NIL;
rel->pages = 0;
rel->tuples = 0;
rel->allvisfrac = 0;
rel->subroot = NULL;
rel->subplan_params = NIL;
rel->rel_parallel_workers = -1; /* set up in GetRelationInfo */
rel->serverid = InvalidOid;
rel->userid = rte->checkAsUser;
rel->useridiscurrent = false;
rel->fdwroutine = NULL;
rel->fdw_private = NULL;
rel->baserestrictinfo = NIL;
rel->baserestrictcost.startup = 0;
rel->baserestrictcost.per_tuple = 0;
rel->joininfo = NIL;
rel->has_eclass_joins = false;
/* Check type of rtable entry */
switch (rte->rtekind)
{
case RTE_RELATION:
/* Table --- retrieve statistics from the system catalogs */
/* 在下面函数中通过查询 Catalog,将目前所针对建立 RelOptInfo 的 RTE 的下列信息项填入:
* min_attr lowest valid AttrNumber 最低有效属性号
* max_attr highest valid AttrNumber 最高有效属性号
* indexlist list of IndexOptInfos for relation's indexes
* serverid if it's a foreign table, the server OID 所属外部表服务器的 OID
* fdwroutine if it's a foreign table, the FDW function pointers 外部表处理函数指针
* pages number of pages 该 RTE 所对应的表的页数
* tuples number of tuples 该 RTE 所对应的表的元组数
*/
get_relation_info(root, rte->relid, rte->inh, rel);
break;
case RTE_SUBQUERY:
case RTE_FUNCTION:
case RTE_VALUES:
case RTE_CTE:
/*
* Subquery, function, or values list --- set up attr range and
* arrays
*
* Note: 0 is included in range to support whole-row Vars
*/
rel->min_attr = 0;
rel->max_attr = list_length(rte->eref->colnames);
rel->attr_needed = (Relids *)
palloc0((rel->max_attr - rel->min_attr + 1) * sizeof(Relids));
rel->attr_widths = (int32 *)
palloc0((rel->max_attr - rel->min_attr + 1) * sizeof(int32));
break;
default:
elog(ERROR, "unrecognized RTE kind: %d",
(int) rte->rtekind);
break;
}
接下来会对 Query 中的 Jointree 进行处理,Jointree 代表一个 SQL 语句中的 From/Where/Join/On 部分。
该部分的语句可能是连接操作,也可能是一些对表的限制条件(如:attr>2),所以在这里需要作一个区分的处理:
deconstruct_jointree() -> deconstruct_recurse() 中:
如果是针对某个表的限制条件,则调用 distribute_qual_to_rels() 将该限制条件下推到该表的 RelOptInfo->baserestrictinfo 中去;
如果是对一些表的连接操作,则先同样调用 distribute_qual_to_rels() 将连接的条件下推到该表的 RelOptInfo->joininfo 中去。同时,构造一个 RangeTblRef * 类型的链表,记录需要在一起连接的基本表。
deconstruct_jointree() 调用完成后,会返回一个上述的连接链表,同时针对每个基本表的扫描或连接限制条件,都存入了各自的 RelOptInfo 的 baserestrictinfo/joininfo 中,如下:
joinlist = deconstruct_jointree(root);
然后就可以利用所获取的信息,进行 Join 的构造了:
/*
* Ready to do the primary planning.
*/
final_rel = make_one_rel(root, joinlist);
在 make_one_rel() 中:
/*
* Compute size estimates and consider_parallel flags for each base rel,
* then generate access paths.
*/
set_base_rel_sizes(root); //估算该基本表的行数和宽度
set_base_rel_pathlists(root); //生成该基本表的 path
进入后,会根据基本表类型的不同,调用不同的函数处理,对于外部表,就是根据之前 RelOptInfo 中初始化的 fdwroutine 调用 fdw 部分的代码的,如下:
rel->fdwroutine->GetForeignRelSize(root, rel, rte->relid);
以及
rel->fdwroutine->GetForeignPaths(root, rel, rte->relid);
针对每个基本表生成 path 后,利用已经生成的 joinlist 构造一个 RELOPT_JOINREL 类型的 RelOptInfo 以及对应的 path,最后返回这个 Join 的 RelOptInfo。
/*
* Generate access paths for the entire join tree.
*/
rel = make_rel_from_joinlist(root, joinlist);
实例:
对于下列语句(其中 test1 和 test2 都是外部表),
select * from test1 left join test2 on test1.no=test2.no limit 3 offset 3;
query_planner() 不会处理其中的 “limit 3 offset 3” 那是在 grouping_planner 中之后做的事。
首先在 PlannerInfo 中初始化存储 RelOptInfo 的空间:
PlannerInfo.simple_rel_array_size = list_length(root->parse->rtable) + 1; // = 4
PlannerInfo.simple_rel_array = (RelOptInfo **)
palloc0(root->simple_rel_array_size * sizeof(RelOptInfo *));
由于 Query 中的 RangeTable 中会有 test1、test2 以及它们之间的 Join 存在,这里的大小为 4,也就是说最多之后可能会用到 4 个 RelOptInfo,但并不是一定会,而且因为 RangeTable 是从 1 开始计数的,第一个空间是不会被使用到。
针对 test1 和 test2 表生成对应的 RelOptInfo;
add_base_rels_to_query(root, (Node *) parse->jointree);
下面是 test1 的 RelOptInfo:
(gdb) p *(RelOptInfo *)root->simple_rel_array[1]
$32 = {
type = T_RelOptInfo,
reloptkind = RELOPT_BASEREL,
relids = 0x245c1d0,
rows = 0,
consider_startup = 1 '\001',
consider_param_startup = 0 '\000',
consider_parallel = 0 '\000',
reltarget = 0x245c1e8,
pathlist = 0x0,
ppilist = 0x0,
partial_pathlist = 0x0,
cheapest_startup_path = 0x0,
cheapest_total_path = 0x0,
cheapest_unique_path = 0x0,
cheapest_parameterized_paths = 0x0,
direct_lateral_relids = 0x0,
lateral_relids = 0x0,
relid = 1,
reltablespace = 0,
rtekind = RTE_RELATION,
min_attr = -7,
max_attr = 2,
attr_needed = 0x245c238,
attr_widths = 0x245c2c8,
lateral_vars = 0x0,
lateral_referencers = 0x0,
indexlist = 0x0,
pages = 0,
tuples = 0,
allvisfrac = 0,
subroot = 0x0,
subplan_params = 0x0,
rel_parallel_workers = -1,
serverid = 16399,
userid = 0,
useridiscurrent = 0 '\000',
fdwroutine = 0x245c318,
fdw_private = 0x0,
baserestrictinfo = 0x0,
baserestrictcost = {
startup = 0,
per_tuple = 0
},
joininfo = 0x0,
has_eclass_joins = 0 '\000'
}
处理 RestrictInfo,由于原句并未包含对表的 Where 限制,所以这里只会生成一个 joinlist;
joinlist = deconstruct_jointree(&PlannerInfo);
生成的 joinlist 是一个 RTE 链表,连接着 2 个节点,分别指向 test1 和 test2 的 RTE,表示将要进行 test1 和 test2 的连接操作;
(gdb) p *(RangeTblRef *)((List *)joinlist)->head->data->ptr_value
$29 = {
type = T_RangeTblRef,
rtindex = 1
}
(gdb) p *(RangeTblRef *)((List *)joinlist)->tail->data->ptr_value
$30 = {
type = T_RangeTblRef,
rtindex = 2
}
最后根据 joinlist 生成 join RelOptInfo:
final_rel = make_one_rel(root, joinlist);
final_rel = {
type = T_RelOptInfo,
reloptkind = RELOPT_JOINREL,
relids = 0x245f3b8,
rows = 9316,
consider_startup = 1 '\001',
consider_param_startup = 0 '\000',
consider_parallel = 0 '\000',
reltarget = 0x245f3d0,
pathlist = 0x245f9d0,
ppilist = 0x0,
partial_pathlist = 0x0,
cheapest_startup_path = 0x246b490,
cheapest_total_path = 0x245f8a0,
cheapest_unique_path = 0x0,
cheapest_parameterized_paths = 0x246b6d0,
direct_lateral_relids = 0x0,
lateral_relids = 0x0,
relid = 0,
reltablespace = 0,
rtekind = RTE_JOIN,
min_attr = 0,
max_attr = 0,
attr_needed = 0x0,
attr_widths = 0x0,
lateral_vars = 0x0,
lateral_referencers = 0x0,
indexlist = 0x0,
pages = 0,
tuples = 0,
allvisfrac = 0,
subroot = 0x0,
subplan_params = 0x0,
rel_parallel_workers = -1,
serverid = 16399,
userid = 0,
useridiscurrent = 0 '\000',
fdwroutine = 0x245c318,
fdw_private = 0x245fc40,
baserestrictinfo = 0x0,
baserestrictcost = {
startup = 0,
per_tuple = 0
},
joininfo = 0x0,
has_eclass_joins = 0 '\000'
}
在名为 final_rel 的 RelOptInfo 中的 pathlist 链表中生成了 3 个 path:
MergePath,先从远端取回数据后,在本地进行 MergeJoin 处理;
HashPath,先从远端取回数据后,在本地进行 HashJoin 处理;
MergePath 和 HashPath 的 outerjoinpath 和 innerjoinpath 则分别指向一个 ForeignPath,分别进行 test1 和 test2 的 ForeignScan。
ForeignPath,将连接语句也一并发送到远端执行,所以对于本地就相当于仅有一个 ForeignScan 即 ForeignPath。
MergePath = {
jpath = {
path = {
type = T_MergePath,
pathtype = T_MergeJoin,
parent = 0x245f1a8,
pathtarget = 0x245f3d0,
param_info = 0x0,
parallel_aware = 0 '\000',
parallel_safe = 0 '\000',
parallel_workers = 0,
rows = 9316,
startup_cost = 444.06045346876857,
total_cost = 590.62545346876846,
pathkeys = 0x0
},
jointype = JOIN_LEFT,
outerjoinpath = 0x245ee90,
innerjoinpath = 0x245efc0,
joinrestrictinfo = 0x245f4f0
},
path_mergeclauses = 0x245f730,
outersortkeys = 0x245f6e0,
innersortkeys = 0x245f7d0,
materialize_inner = 0 '\000'
}
HashPath = {
jpath = {
path = {
type = T_HashPath,
pathtype = T_HashJoin,
parent = 0x245f1a8,
pathtarget = 0x245f3d0,
param_info = 0x0,
parallel_aware = 0 '\000',
parallel_safe = 0 '\000',
parallel_workers = 0,
rows = 9316,
startup_cost = 268.01250000000005,
total_cost = 647.58500000000004,
pathkeys = 0x0
},
jointype = JOIN_LEFT,
outerjoinpath = 0x245ee90,
innerjoinpath = 0x245efc0,
joinrestrictinfo = 0x245f4f0
},
path_mergeclauses = 0x245fbf0,
outersortkeys = 0x1,
innersortkeys = 0x10,
materialize_inner = 32 ' '
}
ForeignPath = {
path = {
type = T_ForeignPath,
pathtype = T_ForeignScan,
parent = 0x245f1a8,
pathtarget = 0x245f3d0,
param_info = 0x0,
parallel_aware = 0 '\000',
parallel_safe = 0 '\000',
parallel_workers = 0,
rows = 9316,
startup_cost = 100,
total_cost = 4991.6824999999999,
pathkeys = 0x0
},
fdw_outerpath = 0x0,
fdw_private = 0x0
}