PostgreSQL 优化器的初步分析:

query_planner():

query_planner() 是优化器生成 path 的基础,所以首先从这里开始分析起。

query_planner() 负责生成一个基本的 path,这个 path 除了基本的扫描表的操作外,最后最多只包含连接操作,剩下的其他诸如聚合或者 limit 等操作,则会到更外层的 grouping_planner() 中进行。

进入 query_planner() 第一步就是对每个基本表生成 RelOptInfo 结构:

基本表就是 FROM 后指定的一个普通的表,而不是由连接等操作生成的表。 在优化生成 plan 的过程中,RelOptInfo 中包含了对一个表所需的信息;

typedef struct RelOptInfo
{
	NodeTag		type;

	RelOptKind	reloptkind;

	/* all relations included in this RelOptInfo */
	Relids		relids;			/* set of base relids (rangetable indexes) */

	/* size estimates generated by planner */
	double		rows;			/* estimated number of result tuples */

	/* per-relation planner control flags */
	bool		consider_startup;		/* keep cheap-startup-cost paths? */
	bool		consider_param_startup; /* ditto, for parameterized paths? */
	bool		consider_parallel;		/* consider parallel paths? */

	/* default result targetlist for Paths scanning this relation */
	struct PathTarget *reltarget;		/* list of Vars/Exprs, cost, width */

	/* materialization information */
	List	   *pathlist;		/* Path structures */
	List	   *ppilist;		/* ParamPathInfos used in pathlist */
	List	   *partial_pathlist;		/* partial Paths */
	struct Path *cheapest_startup_path;
	struct Path *cheapest_total_path;
	struct Path *cheapest_unique_path;
	List	   *cheapest_parameterized_paths;

	/* parameterization information needed for both base rels and join rels */
	/* (see also lateral_vars and lateral_referencers) */
	Relids		direct_lateral_relids;	/* rels directly laterally referenced */
	Relids		lateral_relids; /* minimum parameterization of rel */

	/* information about a base rel (not set for join rels!) */
	Index		relid;
	Oid			reltablespace;	/* containing tablespace */
	RTEKind		rtekind;		/* RELATION, SUBQUERY, or FUNCTION */
	AttrNumber	min_attr;		/* smallest attrno of rel (often <0) */
	AttrNumber	max_attr;		/* largest attrno of rel */
	Relids	   *attr_needed;	/* array indexed [min_attr .. max_attr] */
	int32	   *attr_widths;	/* array indexed [min_attr .. max_attr] */
	List	   *lateral_vars;	/* LATERAL Vars and PHVs referenced by rel */
	Relids		lateral_referencers;	/* rels that reference me laterally */
	List	   *indexlist;		/* list of IndexOptInfo */
	BlockNumber pages;			/* size estimates derived from pg_class */
	double		tuples;
	double		allvisfrac;
	PlannerInfo *subroot;		/* if subquery */
	List	   *subplan_params; /* if subquery */
	int			rel_parallel_workers;	/* wanted number of parallel workers */

	/* Information about foreign tables and foreign joins */
	Oid			serverid;		/* identifies server for the table or join */
	Oid			userid;			/* identifies user to check access as */
	bool		useridiscurrent;	/* join is only valid for current user */
	/* use "struct FdwRoutine" to avoid including fdwapi.h here */
	struct FdwRoutine *fdwroutine;
	void	   *fdw_private;

	/* used by various scans and joins: */
	List	   *baserestrictinfo;		/* RestrictInfo structures (if base
										 * rel) */
	QualCost	baserestrictcost;		/* cost of evaluating the above */
	List	   *joininfo;		/* RestrictInfo structures for join clauses
								 * involving this rel */
	bool		has_eclass_joins;		/* T means joininfo is incomplete */
} RelOptInfo;

以上的 RelOptInfo 结构中的元素,在接下来优化的过程中会逐渐用到。

接下来还需要介绍 PlannerInfo 结构,该结构是整个优化器执行过程中信息存放的地方,所以每个生成的 RelOptInfo 都存放在 PlannerInfo 中:

/*----------
 * PlannerInfo
 *		Per-query information for planning/optimization
 *
 * This struct is conventionally called "root" in all the planner routines.
 * It holds links to all of the planner's working state, in addition to the
 * original Query.  Note that at present the planner extensively modifies
 * the passed-in Query data structure; someday that should stop.
 *----------
 */
typedef struct PlannerInfo
{
	NodeTag		type;

	Query	   *parse;			/* the Query being planned */

	PlannerGlobal *glob;		/* global info for current planner run */

	Index		query_level;	/* 1 at the outermost Query */

	struct PlannerInfo *parent_root;	/* NULL at outermost Query */

	/*
	 * plan_params contains the expressions that this query level needs to
	 * make available to a lower query level that is currently being planned.
	 * outer_params contains the paramIds of PARAM_EXEC Params that outer
	 * query levels will make available to this query level.
	 */
	List	   *plan_params;	/* list of PlannerParamItems, see below */
	Bitmapset  *outer_params;

	/*
	 * simple_rel_array holds pointers to "base rels" and "other rels" (see
	 * comments for RelOptInfo for more info).  It is indexed by rangetable
	 * index (so entry 0 is always wasted).  Entries can be NULL when an RTE
	 * does not correspond to a base relation, such as a join RTE or an
	 * unreferenced view RTE; or if the RelOptInfo hasn't been made yet.
	 */
	struct RelOptInfo **simple_rel_array;		/* All 1-rel RelOptInfos */
	int			simple_rel_array_size;	/* allocated size of array */

	/*
	 * simple_rte_array is the same length as simple_rel_array and holds
	 * pointers to the associated rangetable entries.  This lets us avoid
	 * rt_fetch(), which can be a bit slow once large inheritance sets have
	 * been expanded.
	 */
	RangeTblEntry **simple_rte_array;	/* rangetable as an array */

先通过下面语句为每个要创建 RelOptInfo 的基本表申请空间:

一个 RelOptInfo 对应一个 RangeTableEntry,而这些 RTE 为了避免干扰,是从 Query 中的 jointree 中取出的。

  setup_simple_rel_arrays(PlannerInfo *root)
  {
    Index		rti;
    ListCell   *lc;

    /* Arrays are accessed using RT indexes (1..N) */
    /* 为 Query 中 RangeTable 中的每一个都创建一个 RelOptInfo,*/
    /* 由于 RangeTable 是从 1 到 N 的,所以大小 + 1*/
    root->simple_rel_array_size = list_length(root->parse->rtable) + 1;

    /* simple_rel_array is initialized to all NULLs */
    root->simple_rel_array = (RelOptInfo **)
        palloc0(root->simple_rel_array_size * sizeof(RelOptInfo *));

    /* simple_rte_array is an array equivalent of the rtable list */
    /* RangeTable 中每一个 RangeTableEntry(RTE) 的快捷入口 */
    root->simple_rte_array = (RangeTblEntry **)
        palloc0(root->simple_rel_array_size * sizeof(RangeTblEntry *));
    rti = 1;
    foreach(lc, root->parse->rtable)
    {
        RangeTblEntry *rte = (RangeTblEntry *) lfirst(lc);

        root->simple_rte_array[rti++] = rte;
    }
  }

然后在 add_base_rels_to_query() 中初始化每个 RelOptInfo:

add_base_rels_to_query(root, (Node *) parse->jointree);

从 Query.jointree 中而不是从 Query.rtable 中取出 RTE 可以仅取到不包含 Join 的基本表;

对于每个基本表进行下列操作:

/*
 * build_simple_rel
 *	  Construct a new RelOptInfo for a base relation or 'other' relation.
 */
RelOptInfo *
build_simple_rel(PlannerInfo *root, int relid, RelOptKind reloptkind)
{
	RelOptInfo *rel;
	RangeTblEntry *rte;

	/* Rel should not exist already */
	Assert(relid > 0 && relid < root->simple_rel_array_size);
	if (root->simple_rel_array[relid] != NULL)
		elog(ERROR, "rel %d already exists", relid);

	/* Fetch RTE for relation */
	rte = root->simple_rte_array[relid];
	Assert(rte != NULL);

	rel = makeNode(RelOptInfo);
	rel->reloptkind = reloptkind;
	rel->relids = bms_make_singleton(relid);
	rel->rows = 0;
	/* cheap startup cost is interesting iff not all tuples to be retrieved */
	rel->consider_startup = (root->tuple_fraction > 0);
	rel->consider_param_startup = false;		/* might get changed later */
	rel->consider_parallel = false;		/* might get changed later */
	rel->reltarget = create_empty_pathtarget();
	rel->pathlist = NIL;
	rel->ppilist = NIL;
	rel->partial_pathlist = NIL;
	rel->cheapest_startup_path = NULL;
	rel->cheapest_total_path = NULL;
	rel->cheapest_unique_path = NULL;
	rel->cheapest_parameterized_paths = NIL;
	rel->direct_lateral_relids = NULL;
	rel->lateral_relids = NULL;
	rel->relid = relid;
	rel->rtekind = rte->rtekind;
	/* min_attr, max_attr, attr_needed, attr_widths are set below */
	rel->lateral_vars = NIL;
	rel->lateral_referencers = NULL;
	rel->indexlist = NIL;
	rel->pages = 0;
	rel->tuples = 0;
	rel->allvisfrac = 0;
	rel->subroot = NULL;
	rel->subplan_params = NIL;
	rel->rel_parallel_workers = -1;		/* set up in GetRelationInfo */
	rel->serverid = InvalidOid;
	rel->userid = rte->checkAsUser;
	rel->useridiscurrent = false;
	rel->fdwroutine = NULL;
	rel->fdw_private = NULL;
	rel->baserestrictinfo = NIL;
	rel->baserestrictcost.startup = 0;
	rel->baserestrictcost.per_tuple = 0;
	rel->joininfo = NIL;
	rel->has_eclass_joins = false;
	/* Check type of rtable entry */
	switch (rte->rtekind)
	{
		case RTE_RELATION:
        /* Table --- retrieve statistics from the system catalogs */
        /* 在下面函数中通过查询 Catalog,将目前所针对建立 RelOptInfo 的 RTE 的下列信息项填入:
         *	min_attr	lowest valid AttrNumber 最低有效属性号
         *	max_attr	highest valid AttrNumber 最高有效属性号
         *	indexlist	list of IndexOptInfos for relation's indexes
         *	serverid	if it's a foreign table, the server OID 所属外部表服务器的 OID
         *	fdwroutine	if it's a foreign table, the FDW function pointers 外部表处理函数指针
         *	pages		number of pages 该 RTE 所对应的表的页数
         *	tuples		number of tuples 该 RTE 所对应的表的元组数
		*/
			get_relation_info(root, rte->relid, rte->inh, rel);
			break;
		case RTE_SUBQUERY:
		case RTE_FUNCTION:
		case RTE_VALUES:
		case RTE_CTE:

			/*
			 * Subquery, function, or values list --- set up attr range and
			 * arrays
			 *
			 * Note: 0 is included in range to support whole-row Vars
			 */
			rel->min_attr = 0;
			rel->max_attr = list_length(rte->eref->colnames);
			rel->attr_needed = (Relids *)
				palloc0((rel->max_attr - rel->min_attr + 1) * sizeof(Relids));
			rel->attr_widths = (int32 *)
				palloc0((rel->max_attr - rel->min_attr + 1) * sizeof(int32));
			break;
		default:
			elog(ERROR, "unrecognized RTE kind: %d",
				 (int) rte->rtekind);
			break;
	}

接下来会对 Query 中的 Jointree 进行处理,Jointree 代表一个 SQL 语句中的 From/Where/Join/On 部分。

该部分的语句可能是连接操作,也可能是一些对表的限制条件(如:attr>2),所以在这里需要作一个区分的处理:

deconstruct_jointree() -> deconstruct_recurse() 中:

如果是针对某个表的限制条件,则调用 distribute_qual_to_rels() 将该限制条件下推到该表的 RelOptInfo->baserestrictinfo 中去;

如果是对一些表的连接操作,则先同样调用 distribute_qual_to_rels() 将连接的条件下推到该表的 RelOptInfo->joininfo 中去。同时,构造一个 RangeTblRef * 类型的链表,记录需要在一起连接的基本表。

deconstruct_jointree() 调用完成后,会返回一个上述的连接链表,同时针对每个基本表的扫描或连接限制条件,都存入了各自的 RelOptInfo 的 baserestrictinfo/joininfo 中,如下:

joinlist = deconstruct_jointree(root);

然后就可以利用所获取的信息,进行 Join 的构造了:

/*
	 * Ready to do the primary planning.
	 */
	final_rel = make_one_rel(root, joinlist);

在 make_one_rel() 中:

	/*
	 * Compute size estimates and consider_parallel flags for each base rel,
	 * then generate access paths.
	 */
	set_base_rel_sizes(root); //估算该基本表的行数和宽度
	set_base_rel_pathlists(root); //生成该基本表的 path

进入后,会根据基本表类型的不同,调用不同的函数处理,对于外部表,就是根据之前 RelOptInfo 中初始化的 fdwroutine 调用 fdw 部分的代码的,如下:

rel->fdwroutine->GetForeignRelSize(root, rel, rte->relid);
以及
rel->fdwroutine->GetForeignPaths(root, rel, rte->relid);

针对每个基本表生成 path 后,利用已经生成的 joinlist 构造一个 RELOPT_JOINREL 类型的 RelOptInfo 以及对应的 path,最后返回这个 Join 的 RelOptInfo。

/*
	 * Generate access paths for the entire join tree.
	 */
	rel = make_rel_from_joinlist(root, joinlist);

实例:

对于下列语句(其中 test1 和 test2 都是外部表),

select * from test1 left join test2 on test1.no=test2.no limit 3 offset 3;

query_planner() 不会处理其中的 “limit 3 offset 3” 那是在 grouping_planner 中之后做的事。

首先在 PlannerInfo 中初始化存储 RelOptInfo 的空间:

PlannerInfo.simple_rel_array_size = list_length(root->parse->rtable) + 1; // = 4
PlannerInfo.simple_rel_array = (RelOptInfo **)
        palloc0(root->simple_rel_array_size * sizeof(RelOptInfo *));

由于 Query 中的 RangeTable 中会有 test1、test2 以及它们之间的 Join 存在,这里的大小为 4,也就是说最多之后可能会用到 4 个 RelOptInfo,但并不是一定会,而且因为 RangeTable 是从 1 开始计数的,第一个空间是不会被使用到。

针对 test1 和 test2 表生成对应的 RelOptInfo;

add_base_rels_to_query(root, (Node *) parse->jointree);

下面是 test1 的 RelOptInfo:

(gdb) p *(RelOptInfo *)root->simple_rel_array[1]
$32 = {
  type = T_RelOptInfo, 
  reloptkind = RELOPT_BASEREL, 
  relids = 0x245c1d0, 
  rows = 0, 
  consider_startup = 1 '\001', 
  consider_param_startup = 0 '\000', 
  consider_parallel = 0 '\000', 
  reltarget = 0x245c1e8, 
  pathlist = 0x0, 
  ppilist = 0x0, 
  partial_pathlist = 0x0, 
  cheapest_startup_path = 0x0, 
  cheapest_total_path = 0x0, 
  cheapest_unique_path = 0x0, 
  cheapest_parameterized_paths = 0x0, 
  direct_lateral_relids = 0x0, 
  lateral_relids = 0x0, 
  relid = 1, 
  reltablespace = 0, 
  rtekind = RTE_RELATION, 
  min_attr = -7, 
  max_attr = 2, 
  attr_needed = 0x245c238, 
  attr_widths = 0x245c2c8, 
  lateral_vars = 0x0, 
  lateral_referencers = 0x0, 
  indexlist = 0x0, 
  pages = 0, 
  tuples = 0, 
  allvisfrac = 0, 
  subroot = 0x0, 
  subplan_params = 0x0, 
  rel_parallel_workers = -1, 
  serverid = 16399, 
  userid = 0, 
  useridiscurrent = 0 '\000', 
  fdwroutine = 0x245c318, 
  fdw_private = 0x0, 
  baserestrictinfo = 0x0, 
  baserestrictcost = {
    startup = 0, 
    per_tuple = 0
  }, 
  joininfo = 0x0, 
  has_eclass_joins = 0 '\000'
}

处理 RestrictInfo,由于原句并未包含对表的 Where 限制,所以这里只会生成一个 joinlist;

joinlist = deconstruct_jointree(&PlannerInfo);

生成的 joinlist 是一个 RTE 链表,连接着 2 个节点,分别指向 test1 和 test2 的 RTE,表示将要进行 test1 和 test2 的连接操作;

(gdb) p *(RangeTblRef *)((List *)joinlist)->head->data->ptr_value 
$29 = {
  type = T_RangeTblRef, 
  rtindex = 1
}
(gdb) p *(RangeTblRef *)((List *)joinlist)->tail->data->ptr_value 
$30 = {
  type = T_RangeTblRef, 
  rtindex = 2
}

最后根据 joinlist 生成 join RelOptInfo:

final_rel = make_one_rel(root, joinlist);
final_rel = {
  type = T_RelOptInfo, 
  reloptkind = RELOPT_JOINREL, 
  relids = 0x245f3b8, 
  rows = 9316, 
  consider_startup = 1 '\001', 
  consider_param_startup = 0 '\000', 
  consider_parallel = 0 '\000', 
  reltarget = 0x245f3d0, 
  pathlist = 0x245f9d0, 
  ppilist = 0x0, 
  partial_pathlist = 0x0, 
  cheapest_startup_path = 0x246b490, 
  cheapest_total_path = 0x245f8a0, 
  cheapest_unique_path = 0x0, 
  cheapest_parameterized_paths = 0x246b6d0, 
  direct_lateral_relids = 0x0, 
  lateral_relids = 0x0, 
  relid = 0, 
  reltablespace = 0, 
  rtekind = RTE_JOIN, 
  min_attr = 0, 
  max_attr = 0, 
  attr_needed = 0x0, 
  attr_widths = 0x0, 
  lateral_vars = 0x0, 
  lateral_referencers = 0x0, 
  indexlist = 0x0, 
  pages = 0, 
  tuples = 0, 
  allvisfrac = 0, 
  subroot = 0x0, 
  subplan_params = 0x0, 
  rel_parallel_workers = -1, 
  serverid = 16399, 
  userid = 0, 
  useridiscurrent = 0 '\000', 
  fdwroutine = 0x245c318, 
  fdw_private = 0x245fc40, 
  baserestrictinfo = 0x0, 
  baserestrictcost = {
    startup = 0, 
    per_tuple = 0
  }, 
  joininfo = 0x0, 
  has_eclass_joins = 0 '\000'
}

在名为 final_rel 的 RelOptInfo 中的 pathlist 链表中生成了 3 个 path:

MergePath,先从远端取回数据后,在本地进行 MergeJoin 处理;

HashPath,先从远端取回数据后,在本地进行 HashJoin 处理;

MergePath 和 HashPath 的 outerjoinpath 和 innerjoinpath 则分别指向一个 ForeignPath,分别进行 test1 和 test2 的 ForeignScan。

ForeignPath,将连接语句也一并发送到远端执行,所以对于本地就相当于仅有一个 ForeignScan 即 ForeignPath。

MergePath = {
  jpath = {
    path = {
      type = T_MergePath, 
      pathtype = T_MergeJoin, 
      parent = 0x245f1a8, 
      pathtarget = 0x245f3d0, 
      param_info = 0x0, 
      parallel_aware = 0 '\000', 
      parallel_safe = 0 '\000', 
      parallel_workers = 0, 
      rows = 9316, 
      startup_cost = 444.06045346876857, 
      total_cost = 590.62545346876846, 
      pathkeys = 0x0
    }, 
    jointype = JOIN_LEFT, 
    outerjoinpath = 0x245ee90, 
    innerjoinpath = 0x245efc0, 
    joinrestrictinfo = 0x245f4f0
  }, 
  path_mergeclauses = 0x245f730, 
  outersortkeys = 0x245f6e0, 
  innersortkeys = 0x245f7d0, 
  materialize_inner = 0 '\000'
}

HashPath = {
  jpath = {
    path = {
      type = T_HashPath, 
      pathtype = T_HashJoin, 
      parent = 0x245f1a8, 
      pathtarget = 0x245f3d0, 
      param_info = 0x0, 
      parallel_aware = 0 '\000', 
      parallel_safe = 0 '\000', 
      parallel_workers = 0, 
      rows = 9316, 
      startup_cost = 268.01250000000005, 
      total_cost = 647.58500000000004, 
      pathkeys = 0x0
    }, 
    jointype = JOIN_LEFT, 
    outerjoinpath = 0x245ee90, 
    innerjoinpath = 0x245efc0, 
    joinrestrictinfo = 0x245f4f0
  }, 
  path_mergeclauses = 0x245fbf0, 
  outersortkeys = 0x1, 
  innersortkeys = 0x10, 
  materialize_inner = 32 ' '
}

ForeignPath = {
  path = {
    type = T_ForeignPath, 
    pathtype = T_ForeignScan, 
    parent = 0x245f1a8, 
    pathtarget = 0x245f3d0, 
    param_info = 0x0, 
    parallel_aware = 0 '\000', 
    parallel_safe = 0 '\000', 
    parallel_workers = 0, 
    rows = 9316, 
    startup_cost = 100, 
    total_cost = 4991.6824999999999, 
    pathkeys = 0x0
  }, 
  fdw_outerpath = 0x0, 
  fdw_private = 0x0
}