78 lines
		
	
	
		
			3.3 KiB
		
	
	
	
		
			Ruby
		
	
	
	
			
		
		
	
	
			78 lines
		
	
	
		
			3.3 KiB
		
	
	
	
		
			Ruby
		
	
	
	
# frozen_string_literal: true
 | 
						|
 | 
						|
module Gitlab
 | 
						|
  module Database
 | 
						|
    module PostgresHll
 | 
						|
      # Bucket class represent data structure build with HyperLogLog algorithm
 | 
						|
      # that models data distribution in analysed set. This representation than can be used
 | 
						|
      # for following purposes
 | 
						|
      #   1. Estimating number of unique elements that this structure represents
 | 
						|
      #   2. Merging with other Buckets structure to later estimate number of unique elements in sum of two
 | 
						|
      #      represented data sets
 | 
						|
      #   3. Serializing Buckets structure to json format, that can be stored in various persistence layers
 | 
						|
      #
 | 
						|
      # @example Usage
 | 
						|
      #  ::Gitlab::Database::PostgresHll::Buckets.new(141 => 1, 56 => 1).estimated_distinct_count
 | 
						|
      #  ::Gitlab::Database::PostgresHll::Buckets.new(141 => 1, 56 => 1).merge_hash!(141 => 1, 56 => 5).estimated_distinct_count
 | 
						|
      #  ::Gitlab::Database::PostgresHll::Buckets.new(141 => 1, 56 => 1).to_json
 | 
						|
 | 
						|
      # @note HyperLogLog is an PROBABILISTIC algorithm that ESTIMATES distinct count of given attribute value for supplied relation
 | 
						|
      #  Like all probabilistic algorithm is has ERROR RATE margin, that can affect values,
 | 
						|
      #  for given implementation no higher value was reported (https://gitlab.com/gitlab-org/gitlab/-/merge_requests/45673#accuracy-estimation) than 5.3%
 | 
						|
      #  for the most of a cases this value is lower. However, if the exact value is necessary other tools has to be used.
 | 
						|
      class Buckets
 | 
						|
        TOTAL_BUCKETS = 512
 | 
						|
 | 
						|
        def initialize(buckets = {})
 | 
						|
          @buckets = buckets
 | 
						|
        end
 | 
						|
 | 
						|
        # Based on HyperLogLog structure estimates number of unique elements in analysed set.
 | 
						|
        #
 | 
						|
        # @return [Float] Estimate number of unique elements
 | 
						|
        def estimated_distinct_count
 | 
						|
          @estimated_distinct_count ||= estimate_cardinality
 | 
						|
        end
 | 
						|
 | 
						|
        # Updates instance underlying HyperLogLog structure by merging it with other HyperLogLog structure
 | 
						|
        #
 | 
						|
        # @param other_buckets_hash hash with HyperLogLog structure representation
 | 
						|
        def merge_hash!(other_buckets_hash)
 | 
						|
          buckets.merge!(other_buckets_hash) {|_key, old, new| new > old ? new : old }
 | 
						|
        end
 | 
						|
 | 
						|
        # Serialize instance underlying HyperLogLog structure to JSON format, that can be stored in various persistence layers
 | 
						|
        #
 | 
						|
        # @return [String] HyperLogLog data structure serialized to JSON
 | 
						|
        def to_json(_ = nil)
 | 
						|
          buckets.to_json
 | 
						|
        end
 | 
						|
 | 
						|
        private
 | 
						|
 | 
						|
        attr_accessor :buckets
 | 
						|
 | 
						|
        # arbitrary values that are present in #estimate_cardinality
 | 
						|
        # are sourced from https://www.sisense.com/blog/hyperloglog-in-pure-sql/
 | 
						|
        # article, they are not representing any entity and serves as tune value
 | 
						|
        # for the whole equation
 | 
						|
        def estimate_cardinality
 | 
						|
          num_zero_buckets = TOTAL_BUCKETS - buckets.size
 | 
						|
 | 
						|
          num_uniques = (
 | 
						|
            ((TOTAL_BUCKETS**2) * (0.7213 / (1 + 1.079 / TOTAL_BUCKETS))) /
 | 
						|
            (num_zero_buckets + buckets.values.sum { |bucket_hash| 2**(-1 * bucket_hash)} )
 | 
						|
          ).to_i
 | 
						|
 | 
						|
          if num_zero_buckets > 0 && num_uniques < 2.5 * TOTAL_BUCKETS
 | 
						|
            ((0.7213 / (1 + 1.079 / TOTAL_BUCKETS)) * (TOTAL_BUCKETS *
 | 
						|
              Math.log2(TOTAL_BUCKETS.to_f / num_zero_buckets)))
 | 
						|
          else
 | 
						|
            num_uniques
 | 
						|
          end
 | 
						|
        end
 | 
						|
      end
 | 
						|
    end
 | 
						|
  end
 | 
						|
end
 |