Add a Mercurial commit discovery daemon
Summary: Repository import has three major steps:

- Commit discovery (serial)
- Message parsing (parallel, mostly VCS independent)
- Change parsing (parallel, highly VCS dependent)

This implements commit discovery for Mercurial, similar to git's parsing:

- List the heads of all the branches.
- If we haven't already discovered them, follow them back to their roots (or the first commit we have discovered).
- Import all the newly discovered commits, oldest first.

This is a little complicated but it ensures we discover commits in depth order, so the discovery process is robust against interruption/failure. If we just inserted commits as we went, we might read the tip, insert it, and then crash. When we ran again, we'd think we had already discovered commits older than HEAD. This also allows later stages to rely on being able to find Phabricator commit IDs which correspond to parent commits.

NOTE: This importer is fairly slow because "hg" has a large startup time (compare "hg --version" to "git --version" and "svn --version"; on my machine, hg has 60ms of overhead for any command) and we need to run many commands (see the whole "hg id" mess). You can expect something like 10,000 commits per hour, which means you may need to run overnight to discover a large repository (IIRC, the svn/git discovery processes are both about an order of magnitude faster). We could improve this with batching, but I want to keep it as simple as possible for now.

Test Plan: Discovered all the commits in the main Mercurial repository, http://selenic.com/repo/hg.

Reviewers: Makinde, jungejason, nh, tuomaspelkonen, aran

Reviewed By: Makinde

CC: aran, Makinde

Differential Revision: 943
This commit is contained in:
		| @@ -85,7 +85,7 @@ switch (isset($argv[1]) ? $argv[1] : 'help') { | ||||
|         $phid = $repository->getPHID(); | ||||
|  | ||||
|         switch ($repository->getVersionControlSystem()) { | ||||
|           case 'git': | ||||
|           case PhabricatorRepositoryType::REPOSITORY_TYPE_GIT: | ||||
|             echo "Launching 'git fetch' daemon on the {$desc} repository...\n"; | ||||
|             $control->launchDaemon( | ||||
|               'PhabricatorRepositoryGitFetchDaemon', | ||||
| @@ -99,7 +99,7 @@ switch (isset($argv[1]) ? $argv[1] : 'help') { | ||||
|                 $phid, | ||||
|               )); | ||||
|             break; | ||||
|           case 'svn': | ||||
|           case PhabricatorRepositoryType::REPOSITORY_TYPE_SVN: | ||||
|             echo "Launching discovery daemon on the {$desc} repository...\n"; | ||||
|             $control->launchDaemon( | ||||
|               'PhabricatorRepositorySvnCommitDiscoveryDaemon', | ||||
| @@ -107,6 +107,21 @@ switch (isset($argv[1]) ? $argv[1] : 'help') { | ||||
|                 $phid, | ||||
|               )); | ||||
|             break; | ||||
|           case PhabricatorRepositoryType::REPOSITORY_TYPE_MERCURIAL: | ||||
|             echo "Launching 'hg pull' daemon on the {$desc} repository...\n"; | ||||
|             $control->launchDaemon( | ||||
|               'PhabricatorRepositoryMercurialPullDaemon', | ||||
|               array( | ||||
|                 $phid, | ||||
|               )); | ||||
|             echo "Launching discovery daemon on the {$desc} repository...\n"; | ||||
|             $control->launchDaemon( | ||||
|               'PhabricatorRepositoryMercurialCommitDiscoveryDaemon', | ||||
|               array( | ||||
|                 $phid, | ||||
|               )); | ||||
|             break; | ||||
|  | ||||
|         } | ||||
|       } | ||||
|  | ||||
|   | ||||
| @@ -578,6 +578,7 @@ phutil_register_library_map(array( | ||||
|     'PhabricatorRepositoryGitHubNotification' => 'applications/repository/storage/githubnotification', | ||||
|     'PhabricatorRepositoryGitHubPostReceiveController' => 'applications/repository/controller/github-post-receive', | ||||
|     'PhabricatorRepositoryListController' => 'applications/repository/controller/list', | ||||
|     'PhabricatorRepositoryMercurialCommitDiscoveryDaemon' => 'applications/repository/daemon/commitdiscovery/mercurial', | ||||
|     'PhabricatorRepositoryMercurialPullDaemon' => 'applications/repository/daemon/mercurialpull', | ||||
|     'PhabricatorRepositoryPullLocalDaemon' => 'applications/repository/daemon/pulllocal', | ||||
|     'PhabricatorRepositoryShortcut' => 'applications/repository/storage/shortcut', | ||||
| @@ -1178,6 +1179,7 @@ phutil_register_library_map(array( | ||||
|     'PhabricatorRepositoryGitHubNotification' => 'PhabricatorRepositoryDAO', | ||||
|     'PhabricatorRepositoryGitHubPostReceiveController' => 'PhabricatorRepositoryController', | ||||
|     'PhabricatorRepositoryListController' => 'PhabricatorRepositoryController', | ||||
|     'PhabricatorRepositoryMercurialCommitDiscoveryDaemon' => 'PhabricatorRepositoryCommitDiscoveryDaemon', | ||||
|     'PhabricatorRepositoryMercurialPullDaemon' => 'PhabricatorRepositoryPullLocalDaemon', | ||||
|     'PhabricatorRepositoryPullLocalDaemon' => 'PhabricatorRepositoryDaemon', | ||||
|     'PhabricatorRepositoryShortcut' => 'PhabricatorRepositoryDAO', | ||||
|   | ||||
| @@ -0,0 +1,138 @@ | ||||
| <?php | ||||
|  | ||||
| /* | ||||
|  * Copyright 2011 Facebook, Inc. | ||||
|  * | ||||
|  * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
|  * you may not use this file except in compliance with the License. | ||||
|  * You may obtain a copy of the License at | ||||
|  * | ||||
|  *   http://www.apache.org/licenses/LICENSE-2.0 | ||||
|  * | ||||
|  * Unless required by applicable law or agreed to in writing, software | ||||
|  * distributed under the License is distributed on an "AS IS" BASIS, | ||||
|  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
|  * See the License for the specific language governing permissions and | ||||
|  * limitations under the License. | ||||
|  */ | ||||
|  | ||||
/**
 * Commit discovery daemon for Mercurial repositories.
 *
 * Lists the branch heads of the local repository, walks backward from every
 * head which has not been recorded yet until only known commits remain, and
 * then records the newly found commits so that every commit is recorded after
 * all of its newly found parents.
 */
class PhabricatorRepositoryMercurialCommitDiscoveryDaemon
  extends PhabricatorRepositoryCommitDiscoveryDaemon {

  /**
   * Discover commits reachable from any branch head which is not known yet.
   *
   * @return bool True if at least one branch head was new and was discovered.
   * @throws Exception If the repository is not a Mercurial repository.
   */
  protected function discoverCommits() {
    $repository = $this->getRepository();

    $vcs = $repository->getVersionControlSystem();
    if ($vcs != PhabricatorRepositoryType::REPOSITORY_TYPE_MERCURIAL) {
      throw new Exception("Repository is not a Mercurial repository.");
    }

    list($stdout) = $repository->execxLocalCommand('branches');

    $branches = ArcanistMercurialParser::parseMercurialBranches($stdout);
    $got_something = false;
    foreach ($branches as $branch) {
      // Branch listings carry short hashes; we only ever store full hashes.
      $commit = $this->getFullHash($branch['rev']);
      if ($this->isKnownCommit($commit)) {
        continue;
      }

      $this->discoverCommit($commit);
      $got_something = true;
    }

    return $got_something;
  }

  /**
   * Convert a (possibly shortened) Mercurial commit identifier into the full
   * changeset hash by asking "hg id".
   *
   * @param string Commit identifier, as printed by other "hg" commands.
   * @return string Trimmed output of "hg id --debug -i --rev <commit>".
   */
  private function getFullHash($commit) {

    // NOTE: Mercurial shortens hashes to 12 characters by default. This
    // implies collisions with as few as a few million commits. The
    // documentation sensibly advises "Do not use short-form IDs for
    // long-lived representations". It then continues "You can use the
    // --debug option to display the full changeset ID". What?! Yes, this
    // is in fact the only way to turn on full hashes, and the hg source
    // code is littered with "hexfn = ui.debugflag and hex or short" and
    // similar. There is no more-selective flag or config option.
    //
    // Unfortunately, "hg --debug" turns on tons of other extra output,
    // including full commit messages in "hg log" and "hg parents" (which
    // ignore --style); this renders them unparseable. So we have to use
    // "hg id" to convert short hashes into full hashes. See:
    //
    // <http://mercurial.selenic.com/wiki/ChangeSetID>
    //
    // Of course, this means that if there are collisions we will break here
    // (the short commit identifier won't be unambiguous) but maybe Mercurial
    // will have a --full-hashes flag or something by then and we can fix it
    // properly. Until we run into that, this allows us to store data in the
    // right format so when we eventually encounter this we won't have to
    // reparse every Mercurial repository.

    $repository = $this->getRepository();
    list($stdout) = $repository->execxLocalCommand(
      'id --debug -i --rev %s',
      $commit);
    return trim($stdout);
  }

  /**
   * Discover a new branch head and all of its ancestors which have not been
   * recorded yet, then record them parents-first.
   *
   * Nothing is recorded until the backward walk has finished. Because every
   * commit is recorded only after all of its newly discovered parents, an
   * interruption part way through never leaves a recorded commit whose
   * ancestors are missing, and the next run can simply resume.
   *
   * @param string Full hash of a commit which is not known yet.
   * @return void
   * @throws Exception If the date of a commit can not be parsed.
   */
  private function discoverCommit($commit) {
    $repository = $this->getRepository();

    // Commits whose parents still need to be examined.
    $discover = array($commit);

    // Map of examined hash => whether that commit is new (not yet recorded).
    // This also keeps us from looking up the same parent twice when several
    // children share it.
    $is_new = array($commit => true);

    // Map of new commit hash => list of its parents which are also new.
    $new_parents = array();

    // For all the new commits at the branch heads, walk backward until we find
    // only commits we've already seen.
    while ($discover) {
      $target = array_pop($discover);
      $new_parents[$target] = array();

      list($stdout) = $repository->execxLocalCommand(
        'parents --style default --rev %s',
        $target);
      $parents = ArcanistMercurialParser::parseMercurialLog($stdout);
      if ($parents) {
        foreach ($parents as $parent) {
          $parent_commit = $this->getFullHash($parent['rev']);
          if (!isset($is_new[$parent_commit])) {
            $is_new[$parent_commit] = !$this->isKnownCommit($parent_commit);
            if ($is_new[$parent_commit]) {
              $discover[] = $parent_commit;
            }
          }
          if ($is_new[$parent_commit]) {
            $new_parents[$target][] = $parent_commit;
          }
        }
      }

      $this->stillWorking();
    }

    // Order the new commits so each one comes after all of its new parents.
    // Simply reversing the walk order is not enough when history contains
    // merges: an ancestor shared by both sides of a merge can be reached
    // through one side before a descendant on the other side is found, which
    // would record the descendant ahead of its parent.
    $insert = array();
    $ordered = array();
    $stack = array($commit);
    while ($stack) {
      $target = end($stack);

      $waiting = false;
      foreach ($new_parents[$target] as $parent_commit) {
        if (!isset($ordered[$parent_commit])) {
          $stack[] = $parent_commit;
          $waiting = true;
        }
      }
      if ($waiting) {
        // Come back to this commit once its parents have been ordered.
        continue;
      }

      array_pop($stack);
      if (!isset($ordered[$target])) {
        $ordered[$target] = true;
        $insert[] = $target;
      }
    }

    foreach ($insert as $target) {
      list($stdout) = $repository->execxLocalCommand(
        'log --rev %s --template %s',
        $target,
        '{date|rfc822date}');

      $epoch = strtotime($stdout);
      if ($epoch === false) {
        // Fail loudly instead of recording a bogus timestamp; everything
        // recorded so far is older, so the next run resumes from here.
        throw new Exception(
          "Unable to parse date '{$stdout}' of Mercurial commit '{$target}'.");
      }

      $this->recordCommit($target, $epoch);

      // Recording runs one "hg" command per commit and can take a long time
      // on a large import, so keep signalling progress here too.
      $this->stillWorking();
    }
  }

}
| @@ -0,0 +1,15 @@ | ||||
| <?php | ||||
| /** | ||||
|  * This file is automatically generated. Lint this module to rebuild it. | ||||
|  * @generated | ||||
|  */ | ||||
|  | ||||
|  | ||||
|  | ||||
| phutil_require_module('arcanist', 'repository/parser/mercurial'); | ||||
|  | ||||
| phutil_require_module('phabricator', 'applications/repository/constants/repositorytype'); | ||||
| phutil_require_module('phabricator', 'applications/repository/daemon/commitdiscovery/base'); | ||||
|  | ||||
|  | ||||
| phutil_require_source('PhabricatorRepositoryMercurialCommitDiscoveryDaemon.php'); | ||||
		Reference in New Issue
	
	Block a user
	 epriestley
					epriestley