Skip to content

Commit

Permalink
Merge pull request #302 from hathitrust/DEV-1125-duplicate-holdings-c…
Browse files Browse the repository at this point in the history
…leanup

DEV-1125: duplicate holdings cleanup
  • Loading branch information
aelkiss authored May 23, 2024
2 parents 84e1923 + 96097f3 commit 2bf7c3f
Show file tree
Hide file tree
Showing 2 changed files with 213 additions and 0 deletions.
73 changes: 73 additions & 0 deletions bin/cleanup_duplicate_holdings.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
# frozen_string_literal: true

require 'services'
require 'cluster'

# NOTE(review): presumably initializes the MongoDB connection that Cluster
# relies on — confirm against the Services definition.
Services.mongo!

# Iterates through clusters, removing any duplicate holdings and logging its progress.
#
# Within a cluster, holdings that share an update_key are grouped together and
# only the holdings carrying the most recent date_received in each group are
# kept (several holdings tied on that date are all kept).
#
# Usage: bundle exec ruby bin/cleanup_duplicate_holdings.rb
class CleanupDuplicateHoldings
  # Minimum number of seconds between periodic progress reports.
  LOG_INTERVAL = 60

  def initialize
    @clusters_processed = 0
    @old_holdings_processed = 0
    @new_holdings_processed = 0
    # Monotonic seconds, not wall-clock time; see #monotonic_now.
    @last_log_time = monotonic_now
    Services.logger.info("Starting cluster deduplication")
  end

  # Deduplicates the holdings of every cluster yielded by Cluster.each and
  # logs a final progress summary once all clusters have been visited.
  def run
    Cluster.each do |cluster|
      Services.logger.debug("Cleaning cluster #{cluster._id}: #{cluster.ocns}")
      old_count = cluster.holdings.count
      new_count = remove_duplicate_holdings(cluster)
      update_progress(old_count, new_count)
    end

    Services.logger.info("Finished cleaning clusters")
    log_progress
  end

  private

  # Adds one cluster's before/after holding counts to the running totals and
  # emits a progress report if none has been written within LOG_INTERVAL.
  def update_progress(old_count, new_count)
    @clusters_processed += 1
    @old_holdings_processed += old_count
    @new_holdings_processed += new_count

    log_progress if hasnt_logged_recently?
  end

  # Logs the running totals at INFO level and restarts the throttle timer.
  def log_progress
    Services.logger.info("Processed #{@clusters_processed} clusters")
    Services.logger.info("Processed #{@old_holdings_processed} old holdings")
    Services.logger.info("Kept #{@new_holdings_processed} holdings")
    @last_log_time = monotonic_now
  end

  # True when more than LOG_INTERVAL seconds have passed since the last report.
  def hasnt_logged_recently?
    !@last_log_time || (monotonic_now - @last_log_time > LOG_INTERVAL)
  end

  # Seconds on the monotonic clock. Used for the log throttle so that
  # wall-clock adjustments (NTP, DST) cannot suppress or force a report.
  def monotonic_now
    Process.clock_gettime(Process::CLOCK_MONOTONIC)
  end

  # Replaces the cluster's holdings with the deduplicated set and saves it.
  # Returns the count of deduped holdings.
  def remove_duplicate_holdings(cluster)
    cluster.holdings = dedupe_holdings(cluster)
    # NOTE(review): assumes Cluster#save signals failure with a falsy return
    # value (as Mongoid's save does) — surface it rather than ignoring it.
    Services.logger.warn("Failed to save cluster #{cluster._id}") unless cluster.save
    cluster.holdings.count
  end

  # Returns the cluster's holdings with, for each update_key, only those whose
  # date_received equals the latest date_received seen for that key.
  def dedupe_holdings(cluster)
    cluster.holdings.group_by(&:update_key).values.flat_map do |holdings_group|
      latest_date = holdings_group.map(&:date_received).max
      holdings_group.select { |h| h.date_received == latest_date }
    end
  end
end

# Run the cleanup only when this file is executed directly, not when it is
# loaded via require/require_relative.
CleanupDuplicateHoldings.new.run if __FILE__ == $PROGRAM_NAME
140 changes: 140 additions & 0 deletions spec/cleanup_duplicate_holdings_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
# frozen_string_literal: true

require "spec_helper"

require_relative "../bin/cleanup_duplicate_holdings"

RSpec.describe CleanupDuplicateHoldings do
  # Assigns +value+ to each of the n_enum, n_chron, condition and issn fields.
  def set_blank_fields(holding, value)
    %i[n_enum= n_chron= condition= issn=].each { |writer| holding.public_send(writer, value) }
  end

  # Builds an :all_fields holding and then sets the four fields above to "".
  def blank_fields_holding(**kwargs)
    holding = build(:holding, :all_fields, **kwargs)
    set_blank_fields(holding, "")
    holding
  end

  # Clones +h+ into a separate record: the four fields above become nil, the
  # identifiers are reset, and date_received is moved back to yesterday.
  def nil_fields_dupe_holding(h)
    dupe = h.clone
    set_blank_fields(dupe, nil)
    dupe._id = nil
    dupe.uuid = SecureRandom.uuid
    dupe.date_received = Date.yesterday
    dupe
  end

  # Runs the cleanup under test against whatever clusters currently exist.
  def run_cleanup
    described_class.new.run
  end

  # Registers a logger service that writes to $stdout at the given level.
  def log_to_stdout_at(level)
    Services.register(:logger) { Logger.new($stdout, level: level) }
  end

  before { Cluster.each(&:delete) }

  describe "run" do
    it "cleans up duplicate holdings" do
      original = blank_fields_holding
      create(:cluster, holdings: [original, nil_fields_dupe_holding(original)])

      run_cleanup

      expect(Cluster.first.holdings.count).to eq(1)
    end

    it "leaves non-duplicate holdings alone" do
      original = blank_fields_holding
      unrelated = blank_fields_holding
      create(:cluster, holdings: [original, nil_fields_dupe_holding(original), unrelated])

      run_cleanup

      remaining = Cluster.first.holdings
      expect(remaining.length).to eq(2)
      expect(remaining[0]).not_to eq(remaining[1])
    end

    it "cleans up duplicate holdings from multiple organizations in a cluster" do
      umich = blank_fields_holding(organization: "umich")
      upenn = blank_fields_holding(organization: "upenn")
      create(
        :cluster,
        holdings: [umich, upenn, nil_fields_dupe_holding(umich), nil_fields_dupe_holding(upenn)]
      )

      run_cleanup

      expect(Cluster.first.holdings.count).to eq(2)
      expect(Cluster.first.holdings.map(&:organization).uniq).to contain_exactly("umich", "upenn")
    end

    it "cleans up more than two duplicate holdings in a cluster" do
      original = blank_fields_holding
      create(
        :cluster,
        holdings: [original, nil_fields_dupe_holding(original), nil_fields_dupe_holding(original)]
      )

      run_cleanup

      expect(Cluster.first.holdings.count).to eq(1)
    end

    it "cleans up multiple clusters with duplicate holdings" do
      first_original = blank_fields_holding
      create(:cluster, holdings: [first_original, nil_fields_dupe_holding(first_original)])

      second_original = blank_fields_holding
      create(:cluster, holdings: [second_original, nil_fields_dupe_holding(second_original)])

      run_cleanup

      expect(Cluster.count).to eq(2)
      Cluster.each { |cluster| expect(cluster.holdings.count).to eq(1) }
    end

    it "keeps the holding with the most recent date received" do
      # The comment in the original spec states that the factory dates the
      # holding today; the duplicate built here is dated yesterday.
      original = blank_fields_holding
      create(:cluster, holdings: [original, nil_fields_dupe_holding(original)])

      run_cleanup

      expect(Cluster.first.holdings[0].date_received).to eq(Date.today)
    end

    it "logs what it's working on at DEBUG level" do
      log_to_stdout_at(Logger::DEBUG)

      create(:cluster)

      expect { run_cleanup }.to output(/#{Cluster.first.ocns.first}/).to_stdout
    end

    it "logs how many clusters it's worked on" do
      log_to_stdout_at(Logger::INFO)
      create(:cluster)

      expect { run_cleanup }.to output(/Processed 1 cluster/).to_stdout
    end

    it "logs how many holdings it's worked on" do
      log_to_stdout_at(Logger::INFO)

      original = blank_fields_holding
      create(:cluster, holdings: [original, nil_fields_dupe_holding(original)])

      expect { run_cleanup }.to output(/Processed 2 old holdings.*Kept 1 holding/m).to_stdout
    end
  end
end

0 comments on commit 2bf7c3f

Please sign in to comment.