224 lines
		
	
	
		
			8.3 KiB
		
	
	
	
		
			Ruby
		
	
	
	
			
		
		
	
	
			224 lines
		
	
	
		
			8.3 KiB
		
	
	
	
		
			Ruby
		
	
	
	
# frozen_string_literal: true

module Gitlab
  module Database
    # This class implements an iterator over the namespace hierarchy which uses a recursive
    # depth-first algorithm.
    # You can read more about the algorithm here:
    # https://docs.gitlab.com/ee/development/database/poc_tree_iterator.html
    #
    # With the class, you can iterate over the whole hierarchy including subgroups and project namespaces
    # or just iterate over the subgroups.
    #
    # Usage:
    #
    # # To invoke the iterator, you can take any group id.
    # # Build the cursor object that will be used for tracking our position in the tree hierarchy.
    # cursor = { current_id: 9970, depth: [9970] }
    #
    # # Instantiate the object.
    # iterator = Gitlab::Database::NamespaceEachBatch.new(namespace_class: Namespace, cursor: cursor)
    #
    # iterator.each_batch(of: 100) do |ids|
    #   # return namespace ids which can be Group id or Namespaces::ProjectNamespace id
    #   puts ids
    # end
    #
    # # When you need to break out of the iteration and continue later, you can yield the cursor as a second parameter:
    # iterator.each_batch(of: 100) do |ids, new_cursor|
    #   save_cursor(new_cursor) && break if limit_reached?
    #   puts ids
    # end
    #
    # You can build a new iterator later and resume the processing.
    #
    # # Building an iterator that only returns groups:
    # iterator = Gitlab::Database::NamespaceEachBatch.new(namespace_class: Group, cursor: cursor)
    #
    class NamespaceEachBatch
      # Columns produced by every member of the recursive query, in this order.
      PROJECTIONS = %w[current_id depth ids count index].freeze

      # @param namespace_class [Class] model used to look up child nodes while walking the tree
      #   (see the class-level usage notes: `Namespace` for everything, `Group` for groups only)
      # @param cursor [Hash] position to start from, with `:current_id` and `:depth` (Array) keys
      # @raise [ArgumentError] when `cursor[:depth]` is not an Array or a value is not integer-like
      def initialize(namespace_class:, cursor:)
        @namespace_class = namespace_class
        set_cursor!(cursor)
      end

      # Walks the hierarchy starting at the cursor, yielding one batch per database round trip.
      #
      # Iteration stops when a batch cannot be loaded or when the returned cursor's depth is empty.
      #
      # @param of [Integer] number of traversal steps performed per batch; each step adds at most one id
      # @yieldparam ids the ids collected during the batch
      # @yieldparam new_cursor [Hash] `{ current_id:, depth: }` position that a later iterator can resume from
      def each_batch(of: 500)
        current_cursor = cursor.dup

        first_iteration = true
        loop do
          new_cursor, ids = load_batch(cursor: current_cursor, of: of, first_iteration: first_iteration)
          # load_batch returns an empty array when no row came back, which leaves new_cursor nil.
          break if new_cursor.nil?

          first_iteration = false
          current_cursor = new_cursor

          yield ids, new_cursor

          break if new_cursor[:depth].empty?
        end
      end

      private

      attr_reader :namespace_class, :cursor

      # Runs the recursive query once and returns the state reached by its last step.
      #
      # @return [Array] `[new_cursor, ids]`, or an empty array when the query returned no row
      def load_batch(cursor:, of:, first_iteration: false)
        recursive_scope = build_recursive_query(cursor, of, first_iteration)

        # Every step increments `count`, so the row with the highest count is the most advanced state.
        row = base_namespace_class
          .select(*PROJECTIONS)
          .from(recursive_scope.arel.as(base_namespace_class.table_name)).order(count: :desc)
          .limit(1)
          .first

        return [] unless row

        [{ current_id: row[:current_id], depth: row[:depth] }, row[:ids]]
      end

      # rubocop: disable Style/AsciiComments -- Rendering a graph
      # The depth-first algorithm is implemented here. Consider the following group hierarchy:
      #
      #               ┌──┐
      #               │10│
      #          ┌────┴──┴────┐
      #          │            │
      #        ┌─┴┐          ┌┴─┐
      #        │41│          │72│
      #        └─┬┘          └──┘
      #          │
      #        ┌─┴┐
      #   ┌────┤32├─────┐
      #   │    └─┬┘     │
      #   │      │      │
      # ┌─┴┐   ┌─┴┐    ┌┴─┐
      # │11│   │12│    │18│
      # └──┘   └──┘    └──┘
      #
      # 1. Start with node 10 and look up the left-hand child nodes until reaching the leaf. (walk_down)
      # 2. While walking down, record the depth in an array and also store them in the ids array.
      # 3. depth: 10, 41, 32, 11 | ids: 10, 41, 32, 11
      # 4. Start collecting the ids by looking at the nodes on the deepest level. (next_elements)
      # 5. This gives us the rest of the nodes on the same level (parent_id = 32 AND id > 11)
      # 6. depth: 10, 41, 32, 11 | ids: 10, 41, 32, 11, 12, 18
      # 7. When done, move one level up and pop the last value from the depth. (up_one_level)
      # 8. depth: 10, 41, 32 | ids: 10, 41, 32, 11, 12, 18
      # 9. Do the same, look at the nodes on the same level: no records, 32 was already collected
      # 10. depth: 10, 41, 32 | ids: 10, 41, 32, 11, 12, 18
      # 11. Move one level up again and look at the nodes on the same level.
      # 12. depth: 10, 41 | ids: 10, 41, 32, 11, 12, 18, 72
      # 13. Move one level up again, we reached the root node, iteration is done.
      # 14. depth: 10 | ids: 10, 41, 32, 11, 12, 18, 72
      #
      # By tracking the currently accessed node and the depth we can stop and restore the processing of
      # the hierarchy at any point.
      #
      # rubocop: enable Style/AsciiComments
      def build_recursive_query(cursor, of, first_iteration)
        # Only the first batch reports the starting node itself; resumed batches start with no ids.
        ids = first_iteration ? cursor[:current_id] : ''

        recursive_cte = Gitlab::SQL::RecursiveCTE.new(:result,
          union_args: {
            remove_order: false,
            remove_duplicates: false
          })

        # Anchor row holding the starting state. The interpolated cursor values are integers:
        # set_cursor! coerces the initial cursor and later cursors are read back from query rows.
        # No anchor row is produced when namespace_exists_query matches nothing.
        recursive_cte << base_namespace_class.select(
          Arel.sql("#{cursor[:current_id]}::bigint").as('current_id'),
          Arel.sql("ARRAY[#{cursor[:depth].join(',')}]::bigint[]").as('depth'),
          Arel.sql("ARRAY[#{ids}]::bigint[]").as('ids'),
          Arel.sql('1::bigint AS count'),
          Arel.sql('0::bigint AS index')
        ).from('(VALUES (1)) AS initializer_row')
          .where_exists(namespace_exists_query)

        cte = Gitlab::SQL::CTE.new(:cte, base_namespace_class.select('result.*').from('result'))

        # Each recursion step keeps a single candidate, preferring the lowest index:
        # walk_down (1), then next_elements (2), then up_one_level (3).
        union_query = base_namespace_class.with(cte.to_arel).from_union(
          walk_down,
          next_elements,
          up_one_level,
          remove_duplicates: false,
          remove_order: false
        ).select(*PROJECTIONS).order(base_namespace_class.arel_table[:index].asc).limit(1)

        recursive_cte << union_query

        # One anchor row plus `of` recursion steps.
        base_namespace_class.with
          .recursive(recursive_cte.to_arel)
          .from(recursive_cte.alias_to(namespace_class.arel_table))
          .select(*PROJECTIONS)
          .limit(of + 1)
      end

      # Existence check attached to the anchor row; uses the cursor the iterator was built with.
      def namespace_exists_query
        base_namespace_class.where(id: cursor[:current_id])
      end

      # Step candidate: descend to the child of the current node with the smallest id.
      def walk_down
        lateral_query = namespace_class
          .select(:id)
          .where('parent_id = cte.current_id')
          .order(:id)
          .limit(1)

        base_namespace_class.select(
          base_namespace_class.arel_table[:id].as('current_id'),
          Arel.sql("cte.depth || #{base_namespace_table}.id::bigint").as('depth'),
          Arel.sql("cte.ids || #{base_namespace_table}.id::bigint").as('ids'),
          Arel.sql('cte.count + 1').as('count'),
          Arel.sql('1::bigint AS index')
        ).from("cte, LATERAL (#{lateral_query.to_sql}) #{base_namespace_table}")
      end

      # Step candidate: move to the next sibling, i.e. the smallest id greater than the last depth
      # entry among rows whose parent is the second-to-last depth entry.
      def next_elements
        lateral_query = namespace_class
          .select(:id)
          .where("#{base_namespace_table}.parent_id = cte.depth[array_length(cte.depth, 1) - 1]")
          .where("#{base_namespace_table}.id > cte.depth[array_length(cte.depth, 1)]")
          .order(:id)
          .limit(1)

        base_namespace_class.select(
          base_namespace_class.arel_table[:id].as('current_id'),
          # Replace the last depth entry with the sibling's id.
          Arel.sql("cte.depth[:array_length(cte.depth, 1) - 1] || #{base_namespace_table}.id::bigint").as('depth'),
          Arel.sql("cte.ids || #{base_namespace_table}.id::bigint").as('ids'),
          Arel.sql('cte.count + 1').as('count'),
          Arel.sql('2::bigint AS index')
        ).from("cte, LATERAL (#{lateral_query.to_sql}) #{base_namespace_table}")
      end

      # Step candidate: drop the last depth entry without collecting a new id.
      # Produces nothing once the depth array is empty, which ends the recursion.
      def up_one_level
        base_namespace_class.select(
          Arel.sql('cte.current_id').as('current_id'),
          Arel.sql('cte.depth[:array_length(cte.depth, 1) - 1]').as('depth'),
          Arel.sql('cte.ids').as('ids'),
          Arel.sql('cte.count + 1').as('count'),
          Arel.sql('3::bigint AS index')
        ).from('cte')
          .where("cte.depth <> '{}'")
          .limit(1)
      end

      # Model that the outer queries and SELECT lists are built on.
      def base_namespace_class
        Namespace
      end

      # Quoted table name used as the alias of the LATERAL subqueries.
      def base_namespace_table
        base_namespace_class.quoted_table_name
      end

      # Validates the caller-supplied cursor and stores an integer-only copy of it.
      # The values are later interpolated into SQL, so everything is coerced with Integer().
      #
      # @raise [ArgumentError] when `:depth` is not an Array or a value is not integer-like
      def set_cursor!(original_cursor)
        raise ArgumentError, 'cursor[:depth] must be an Array' unless original_cursor[:depth].is_a?(Array)

        @cursor = {
          current_id: Integer(original_cursor[:current_id]),
          depth: original_cursor[:depth].map { |value| Integer(value) }
        }
      end
    end
  end
end
 |