From 0e14bdd09f7f3cfe83b02097e2a4b792d38c2363 Mon Sep 17 00:00:00 2001
From: Francesco Siddi
Date: Sun, 3 Jun 2018 02:09:20 +0200
Subject: [PATCH] Introduce rating functions

These hotness and confidence calculation algorithms come from Reddit
and have been tweaked based on our experience on the Dillo project.
---
 pillar/api/utils/rating.py   | 99 ++++++++++++++++++++++++++++++++++++
 tests/test_api/test_utils.py | 42 +++++++++++++++
 2 files changed, 141 insertions(+)
 create mode 100644 pillar/api/utils/rating.py

diff --git a/pillar/api/utils/rating.py b/pillar/api/utils/rating.py
new file mode 100644
index 00000000..65e8140f
--- /dev/null
+++ b/pillar/api/utils/rating.py
@@ -0,0 +1,99 @@
+# These functions come from Reddit
+# https://github.com/reddit/reddit/blob/master/r2/r2/lib/db/_sorts.pyx
+
+# Additional resources
+# http://www.redditblog.com/2009/10/reddits-new-comment-sorting-system.html
+# http://www.evanmiller.org/how-not-to-sort-by-average-rating.html
+# http://amix.dk/blog/post/19588
+
+from datetime import datetime, timezone
+from math import log
+from math import sqrt
+
+epoch = datetime(1970, 1, 1, 0, 0, 0, 0, timezone.utc)
+
+
+def epoch_seconds(date):
+    """Returns the number of seconds from the epoch to date."""
+    td = date - epoch
+    return td.days * 86400 + td.seconds + (float(td.microseconds) / 1000000)
+
+
+def score(ups, downs):
+    """Returns the net score: upvotes minus downvotes."""
+    return ups - downs
+
+
+def hot(ups, downs, date):
+    """The hot formula. Reddit's hot ranking uses the logarithm function to
+    weight the first votes higher than the rest.
+    The first 10 upvotes have the same weight as the next 100 upvotes which
+    have the same weight as the next 1000, etc.
+
+    Dillo authors: we modified the formula so that entries with more than
+    one downvote are penalised per downvote once downvotes exceed upvotes
+    by at least 25% of the downvotes. Entries below that threshold, such as
+    controversial ones, keep the base value.
+
+    TODO: make this function more dynamic so that different defaults can be
+    specified depending on the item that is being rated.
+    """
+
+    s = score(ups, downs)
+    order = log(max(abs(s), 1), 10)
+    sign = 1 if s > 0 else -1 if s < 0 else 0
+    seconds = epoch_seconds(date) - 1134028003
+    base_hot = round(sign * order + seconds / 45000, 7)
+
+    if downs > 1:
+        rating_delta = 100 * (downs - ups) / downs
+        if rating_delta < 25:
+            # The post is controversial
+            return base_hot
+        base_hot = base_hot - (downs * 6)
+
+    return base_hot
+
+
+def _confidence(ups, downs):
+    """Returns the lower bound of the Wilson score confidence interval.
+
+    See http://www.evanmiller.org/how-not-to-sort-by-average-rating.html
+    """
+    n = ups + downs
+
+    if n == 0:
+        return 0
+
+    z = 1.0  # 1.0 = 85%, 1.6 = 95%
+    phat = float(ups) / n
+    # Only the spread term belongs under the square root; taking the root of
+    # the whole numerator gave entries without upvotes a positive confidence.
+    spread = z * sqrt((phat * (1 - phat) + z * z / (4 * n)) / n)
+    return (phat + z * z / (2 * n) - spread) / (1 + z * z / n)
+
+
+def confidence(ups, downs):
+    """Returns the confidence rating for the given votes; 0 without votes."""
+    # _confidence() already returns 0 when there are no votes.
+    return _confidence(ups, downs)
+
+
+def update_hot(document):
+    """Update the hotness of a document given its current ratings.
+
+    We expect the document to implement the ratings_embedded_schema in
+    a 'ratings' property.
+    """
+
+    dt = document['_created']
+    if dt.tzinfo is None:
+        # Naive timestamps are taken to be UTC. Aware ones already denote an
+        # exact instant, so relabelling them would shift the result.
+        dt = dt.replace(tzinfo=timezone.utc)
+
+    document['properties']['ratings']['hot'] = hot(
+        document['properties']['ratings']['positive'],
+        document['properties']['ratings']['negative'],
+        dt,
+    )
diff --git a/tests/test_api/test_utils.py b/tests/test_api/test_utils.py
index 9d929782..46340830 100644
--- a/tests/test_api/test_utils.py
+++ b/tests/test_api/test_utils.py
@@ -184,3 +184,45 @@ class NodeSetattrTest(unittest.TestCase):
 
         node_setattr(node, 'b.complex', {None: 5})
         self.assertEqual({'b': {'complex': {None: 5}}}, node)
+
+
+class TestRating(unittest.TestCase):
+    def test_hotness(self):
+        """We expect the sorted values to reflect the original order in the
+        list.
+        """
+        from datetime import datetime, timezone
+        from pillar.api.utils.rating import hot
+        t = datetime(2017, 2, 11, 0, 0, 0, 0, timezone.utc)
+        y = datetime(2017, 2, 10, 0, 0, 0, 0, timezone.utc)
+        w = datetime(2017, 2, 5, 0, 0, 0, 0, timezone.utc)
+        cases = [
+            (hot(1, 8, t), 'today super bad'),
+            (hot(0, 3, t), 'today slightly worse'),
+            (hot(0, 2, y), 'yesterday bad'),
+            (hot(0, 2, t), 'today bad'),
+            (hot(4, 4, w), 'last week controversial'),
+            (hot(7, 1, w), 'last week very good'),
+            (hot(5, 1, y), 'yesterday medium'),
+            (hot(5, 0, y), 'yesterday good'),
+            (hot(7, 1, y), 'yesterday very good'),
+            (hot(4, 4, t), 'today controversial'),
+            (hot(7, 1, t), 'today very good'),
+        ]
+        sorted_by_hot = sorted(cases, key=lambda tup: tup[0])
+        # Compare the labels so that a failure names the misplaced cases.
+        self.assertEqual([label for _, label in cases],
+                         [label for _, label in sorted_by_hot])
+
+    def test_confidence(self):
+        """Confidence is the lower bound of the Wilson score interval."""
+        from pillar.api.utils.rating import confidence
+
+        self.assertEqual(0, confidence(0, 0))
+        # Without upvotes the lower bound is zero.
+        self.assertAlmostEqual(0, confidence(0, 10))
+        # With z = 1.0 a single upvote gives (1 + 0.5 - 0.5) / 2.
+        self.assertAlmostEqual(0.5, confidence(1, 0))
+        # More votes at the same ratio raise the confidence.
+        self.assertLess(confidence(1, 1), confidence(10, 10))
+        self.assertLess(confidence(10, 10), confidence(100, 100))