161 lines
		
	
	
		
			5.0 KiB
		
	
	
	
		
			Ruby
		
	
	
	
			
		
		
	
	
			161 lines
		
	
	
		
			5.0 KiB
		
	
	
	
		
			Ruby
		
	
	
	
# frozen_string_literal: true
 | 
						|
 | 
						|
# ActiveModel::Serialization (https://github.com/rails/rails/blob/v5.0.7/activemodel/lib/active_model/serialization.rb#L184)
# is simple in that it recursively calls `as_json` on each object to
# serialize everything. However, for a model like a Project, this can
# generate a query for every single association, which can add up to tens
# of thousands of queries and lead to memory bloat.
#
# To improve this, we can do several things:
#
# 1. Use the option tree in http://api.rubyonrails.org/classes/ActiveModel/Serializers/JSON.html
#    to generate the necessary preload clauses.
#
# 2. We observe that a single project has many issues, merge requests,
#    etc. Instead of serializing everything at once, which could lead to
#    database timeouts and high memory usage, we take each top-level
#    association and serialize the data in batches.
#
#    For example, we serialize the first 100 issues and preload all of
#    their associated events, notes, etc. before moving on to the next
#    batch. When we're done, we serialize merge requests in the same way.
#    We repeat this pattern for the remaining associations specified in
#    import_export.yml.
 | 
						|
module Gitlab
 | 
						|
  module ImportExport
 | 
						|
    class FastHashSerializer
 | 
						|
      attr_reader :subject, :tree
 | 
						|
 | 
						|
      # Using this class delays the serialization of a
      # relation: the serialization is triggered when
      # `JSON.generate` is executed.
      #
      # This class uses memory-optimised, lazily
      # initialised, fast to recycle relation
      # serialization.
      #
      # `JSON.generate` uses `#to_json`, which returns
      # raw JSON content that is written directly to
      # file.
 | 
						|
      class JSONBatchRelation
 | 
						|
        include Gitlab::Utils::StrongMemoize
 | 
						|
 | 
						|
        # Stores the inputs consumed later by #raw_json; nothing is loaded or
        # serialized at construction time.
        #
        # relation - enumerable collection whose items are serialized; it must
        #            also respond to `preload` when preloads are given.
        # options  - passed unchanged to every item's `to_json`.
        # preloads - argument for `relation.preload`; nil/false skips preloading.
        def initialize(relation, options, preloads)
          @options = options
          @relation = relation
          @preloads = preloads
        end
 | 
						|
 | 
						|
        # Renders every item of the relation with `to_json(@options)` and joins
        # the pieces with commas (no surrounding brackets). The relation is
        # preloaded first when preloads were given. The result is memoized via
        # `strong_memoize`, so the relation is walked only once.
        def raw_json
          strong_memoize(:raw_json) do
            records = @preloads ? @relation.preload(@preloads) : @relation

            # Append into a single mutable buffer instead of building an
            # intermediate array of per-item strings.
            records.each_with_object(+'') do |record, json|
              json << ',' unless json.empty?
              json << record.to_json(@options)
            end
          end
        end
 | 
						|
 | 
						|
        # Returns the raw JSON text produced by #raw_json. The argument
        # (whatever the JSON generator hands over) is accepted and ignored.
        def to_json(_options = {})
          raw_json
        end
 | 
						|
 | 
						|
        # This object only exposes already-rendered JSON text through #to_json;
        # it has no Hash representation, so #as_json always raises
        # NotImplementedError, whatever arguments are given.
        def as_json(*_args)
          raise NotImplementedError
        end
 | 
						|
      end
 | 
						|
 | 
						|
      BATCH_SIZE = 100
 | 
						|
 | 
						|
      # subject    - object to serialize; it receives `as_json` and one
      #              `public_send` per included association.
      # tree       - options Hash; its :include and :preload entries drive the
      #              serialization of associations.
      # batch_size - number of rows per batch when an association is an
      #              ActiveRecord::Relation (defaults to BATCH_SIZE).
      def initialize(subject, tree, batch_size: BATCH_SIZE)
        @batch_size = batch_size
        @subject = subject
        @tree = tree
      end
 | 
						|
 | 
						|
      # Because `JSONBatchRelation` is used, this returns a partially
      # serialized hash whose nested content is not easily accessible.
      # This means you can only manipulate and replace top-level objects.
      # All future mutations of the hash (such as `fix_project_tree`)
      # should be aware of that.
 | 
						|
      # Merges the subject's own serialized attributes with its serialized
      # includes; on a key clash the include wins (Hash#merge).
      def execute
        attributes = simple_serialize
        associations = serialize_includes

        attributes.merge(associations)
      end
 | 
						|
 | 
						|
      private
 | 
						|
 | 
						|
      # Serializes only the subject itself, without any association.
      #
      # The :include and :preload sections of the tree are blanked out before
      # the options reach `as_json`; associations are handled separately by
      # #serialize_includes. The key is :preload (singular), the same key the
      # #preloads reader uses. `tree` itself is not modified.
      #
      # Returns whatever `subject.as_json` returns for those options.
      def simple_serialize
        subject.as_json(
          tree.merge(include: nil, preload: nil))
      end
 | 
						|
 | 
						|
      # Builds a Hash of serialized associations from the tree's include
      # definitions. Definitions for which #serialize_include_definition
      # returns nil are left out. Returns {} when there are no includes.
      def serialize_includes
        return {} unless includes

        includes.each_with_object({}) do |definition, serialized|
          pair = serialize_include_definition(definition)
          next if pair.nil?

          name, payload = pair
          serialized[name] = payload
        end
      end
 | 
						|
 | 
						|
      # definition:
 | 
						|
      # { labels: { includes: ... } }
 | 
						|
      # Serializes the one association named by `definition`, a single-entry
      # Hash of association name => serialization options.
      #
      # Returns a [name_as_string, serialized_value] pair, or nil when the
      # subject has no value for the association or nothing was serialized.
      #
      # Raises ArgumentError unless `definition` is a Hash with exactly one
      # entry.
      def serialize_include_definition(definition)
        unless definition.is_a?(Hash)
          raise ArgumentError, 'definition needs to be Hash'
        end

        unless definition.size == 1
          raise ArgumentError, 'definition needs to have exactly one Hash element'
        end

        association, association_options = definition.first

        target = subject.public_send(association) # rubocop: disable GitlabSecurity/PublicSend
        return unless target

        serialized = serialize_record(association, target, association_options)

        # `#as_json` always returns keys as `strings`
        [association.to_s, serialized] if serialized
      end
 | 
						|
 | 
						|
      # Serializes the value of one included association.
      #
      # key     - association name; used in the error message and to look up
      #           this association's preloads.
      # record  - value the subject returned for `key`.
      # options - serialization options for the association.
      #
      # Returns `record.as_json(options)` for anything that is not an
      # ActiveRecord::Relation. For a relation, returns an Array holding one
      # JSONBatchRelation per batch of `@batch_size` rows.
      #
      # Raises RuntimeError when `record` does not respond to `as_json`.
      def serialize_record(key, record, options)
        unless record.respond_to?(:as_json)
          raise "Invalid type of #{key} is #{record.class}"
        end

        # no has-many relation
        return record.as_json(options) unless record.is_a?(ActiveRecord::Relation)

        # The tree may have no :preload section at all; `preloads` is then nil
        # and the batches are serialized without preloading anything.
        batch_preloads = preloads[key] if preloads

        data = []

        record.in_batches(of: @batch_size) do |batch| # rubocop:disable Cop/InBatches
          # order each batch by its primary key to ensure
          # consistent and predictable ordering of each exported relation
          # as additional `WHERE` clauses can impact the order in which data is being
          # returned by database when no `ORDER` is specified
          batch = batch.reorder(batch.klass.primary_key)

          # `raw_json` is called right away, so each batch is rendered while
          # iterating rather than when the JSON is finally generated.
          data.append(JSONBatchRelation.new(batch, options, batch_preloads).tap(&:raw_json))
        end

        data
      end
 | 
						|
 | 
						|
      # The tree's :include entry: the association definitions handled by
      # #serialize_includes, or nil when the tree has none.
      def includes
        tree[:include]
      end
 | 
						|
 | 
						|
      # The tree's :preload entry, looked up per association name when
      # relations are serialized; nil when the tree has none.
      def preloads
        tree[:preload]
      end
 | 
						|
    end
 | 
						|
  end
 | 
						|
end
 |