From ccff47682f7c6b6d93b284f8558e72cd47b319bf Mon Sep 17 00:00:00 2001 From: epriestley Date: Tue, 10 Jan 2017 12:21:22 -0800 Subject: [PATCH] Provide more useful guidance if a repository is clusterized into an existing multi-device cluster Summary: Fixes T12087. When transitioning into a clustered configuration for the first time, the documentation recommends using a one-device cluster as a transitional step. However, installs may not do this for whatever reason, and we aren't as clear as we could be in warning about clusterizing directly into a multi-device cluster. Roughly, when you do this, we end up believing that working copies exist on several different devices, but have no information about which copy or copies are up to date. //Usually// they all were already synchronized and are all up to date, but we can't make this assumption safely without risking data. Instead, we err on the side of caution, and require a human to tell us which copy we should consider to be up-to-date, using `bin/repository thaw --promote`. Test Plan: ``` $ ./bin/repository clusterize rLOCKS --service repos001.phacility.net Service "repos001.phacility.net" is actively bound to more than one device (local002.local, local001.phacility.net). If you clusterize a repository onto this service it will be unclear which devices have up-to-date copies of the repository. This leader/follower ambiguity will freeze the repository. You may need to manually promote a device to unfreeze it. See "Ambiguous Leaders" in the documentation for discussion. Continue anyway? [y/N] ``` Read other changes. 
Reviewers: chad Reviewed By: chad Maniphest Tasks: T12087 Differential Revision: https://secure.phabricator.com/D17169 --- .../DiffusionRepositoryClusterEngine.php | 5 ++- ...RepositoryManagementClusterizeWorkflow.php | 35 ++++++++++++++++++- .../user/cluster/cluster_repositories.diviner | 13 ++++--- 3 files changed, 45 insertions(+), 8 deletions(-) diff --git a/src/applications/diffusion/protocol/DiffusionRepositoryClusterEngine.php b/src/applications/diffusion/protocol/DiffusionRepositoryClusterEngine.php index 8937c1f205..4bb9ca6255 100644 --- a/src/applications/diffusion/protocol/DiffusionRepositoryClusterEngine.php +++ b/src/applications/diffusion/protocol/DiffusionRepositoryClusterEngine.php @@ -251,9 +251,8 @@ final class DiffusionRepositoryClusterEngine extends Phobject { pht( 'Repository "%s" exists on more than one device, but no device '. 'has any repository version information. Phabricator can not '. - 'guess which copy of the existing data is authoritative. Remove '. - 'all but one device from service to mark the remaining device '. - 'as the authority.', + 'guess which copy of the existing data is authoritative. Promote '. 
+ 'a device or see "Ambiguous Leaders" in the documentation.', $repository->getDisplayName())); } diff --git a/src/applications/repository/management/PhabricatorRepositoryManagementClusterizeWorkflow.php b/src/applications/repository/management/PhabricatorRepositoryManagementClusterizeWorkflow.php index 7424b84eae..7178564067 100644 --- a/src/applications/repository/management/PhabricatorRepositoryManagementClusterizeWorkflow.php +++ b/src/applications/repository/management/PhabricatorRepositoryManagementClusterizeWorkflow.php @@ -61,6 +61,7 @@ final class PhabricatorRepositoryManagementClusterizeWorkflow array( AlmanacClusterRepositoryServiceType::SERVICETYPE, )) + ->needBindings(true) ->executeOne(); if (!$service) { throw new PhutilArgumentUsageException( @@ -70,9 +71,41 @@ final class PhabricatorRepositoryManagementClusterizeWorkflow } } - if ($service) { $service_phid = $service->getPHID(); + + $bindings = $service->getActiveBindings(); + + $unique_devices = array(); + foreach ($bindings as $binding) { + $unique_devices[$binding->getDevicePHID()] = $binding->getDevice(); + } + + if (count($unique_devices) > 1) { + $device_names = mpull($unique_devices, 'getName'); + + echo id(new PhutilConsoleBlock()) + ->addParagraph( + pht( + 'Service "%s" is actively bound to more than one device (%s).', + $service_name, + implode(', ', $device_names))) + ->addParagraph( + pht( + 'If you clusterize a repository onto this service it may be '. + 'unclear which devices have up-to-date copies of the '. + 'repository. If so, leader/follower ambiguity will freeze the '. + 'repository. You may need to manually promote a device to '. + 'unfreeze it. See "Ambiguous Leaders" in the documentation '. 
+ 'for discussion.')) + ->drawConsoleString(); + + $prompt = pht('Continue anyway?'); + if (!phutil_console_confirm($prompt)) { + throw new PhutilArgumentUsageException( + pht('User aborted the workflow.')); + } + } } else { $service_phid = null; } diff --git a/src/docs/user/cluster/cluster_repositories.diviner b/src/docs/user/cluster/cluster_repositories.diviner index 463814c890..21aba731cc 100644 --- a/src/docs/user/cluster/cluster_repositories.diviner +++ b/src/docs/user/cluster/cluster_repositories.diviner @@ -422,17 +422,22 @@ Ambiguous Leaders ================= Repository clusters can also freeze if the leader devices are ambiguous. This -can happen if you replace an entire cluster with new devices suddenly, or -make a mistake with the `--demote` flag. This generally arises from some kind -of operator error, like this: +can happen if you replace an entire cluster with new devices suddenly, or make +a mistake with the `--demote` flag. This may arise from some kind of operator +error, like these: - Someone accidentally uses `bin/repository thaw ... --demote` to demote every device in a cluster. - Someone accidentally deletes all the version information for a repository from the database by making a mistake with a `DELETE` or `UPDATE` query. - - Someone accidentally disable all of the devices in a cluster, then add + - Someone accidentally disables all of the devices in a cluster, then adds entirely new ones before repositories can propagate. +If you are moving repositories into cluster services, you can also reach this +state if you use `clusterize` to associate a repository with a service that is +bound to multiple active devices. In this case, Phabricator will not know which +device or devices have up-to-date information. + When Phabricator can not tell which device in a cluster is a leader, it freezes the cluster because it is possible that some devices have less data and others have more, and if it choses a leader arbitrarily it may destroy some data