Introduce rating functions

These hotness and confidence calculation algorithms come from Reddit and have been tweaked based on our experience on the Dillo project.
2018-06-03 02:09:20 +02:00
parent ce6df542cc
commit 0e14bdd09f
2 changed files with 115 additions and 0 deletions
--- a/pillar/api/utils/rating.py
+++ b/pillar/api/utils/rating.py
@@ -0,0 +1,87 @@
 # These functions come from Reddit
 # https://github.com/reddit/reddit/blob/master/r2/r2/lib/db/_sorts.pyx
 # Additional resources
 # http://www.redditblog.com/2009/10/reddits-new-comment-sorting-system.html
 # http://www.evanmiller.org/how-not-to-sort-by-average-rating.html
 # http://amix.dk/blog/post/19588
 from datetime import datetime, timezone
 from math import log
 from math import sqrt
 # Unix epoch (1970-01-01 00:00:00) as a timezone-aware UTC datetime; used by
 # epoch_seconds() as the reference point for its subtraction.
 epoch = datetime(1970, 1, 1, 0, 0, 0, 0, timezone.utc)
 def epoch_seconds(date):
    """Return the number of seconds between the Unix epoch and ``date``.

    ``date`` is subtracted from the timezone-aware module-level ``epoch``,
    so it has to be a timezone-aware datetime as well. The result is a
    float that includes the microsecond fraction.
    """
    delta = date - epoch
    whole_seconds = delta.days * 86400 + delta.seconds
    fraction = float(delta.microseconds) / 1000000
    return whole_seconds + fraction
 def score(ups, downs):
    """Return the net score of an item: up-votes minus down-votes."""
    net = ups - downs
    return net
 def hot(ups, downs, date):
    """Return the hotness of an item from its votes and creation date.

    Based on Reddit's hot ranking: the net score is weighted with a base-10
    logarithm, so the first 10 votes count as much as the next 100, which
    count as much as the next 1000, etc. The age term adds one point for
    every 45000 seconds elapsed since the fixed reference timestamp
    1134028003, so newer items rank higher.

    Dillo authors' modification: when an item has more than one down-vote
    and the down-votes outweigh the up-votes by 25% or more (measured
    relative to the number of down-votes), an extra penalty of 6 points per
    down-vote is subtracted. Items below that threshold are considered
    controversial and keep the unpenalised value.

    TODO: make this function more dynamic so that different defaults can be
    specified depending on the item that is being rated.
    """
    net = score(ups, downs)
    magnitude = log(max(abs(net), 1), 10)
    if net > 0:
        direction = 1
    elif net < 0:
        direction = -1
    else:
        direction = 0
    age = epoch_seconds(date) - 1134028003
    hotness = round(direction * magnitude + age / 45000, 7)

    # With at most one down-vote there is nothing to penalise.
    if not downs > 1:
        return hotness

    # Percentage by which down-votes exceed up-votes, relative to down-votes.
    negative_margin = 100 * (downs - ups) / downs
    if negative_margin < 25:
        # The post is controversial: leave the hotness untouched.
        return hotness

    # Clearly disliked: push the item down in proportion to its down-votes.
    return hotness - (downs * 6)
 def _confidence(ups, downs):
    n = ups + downs
    if n == 0:
        return 0
    z = 1.0 #1.0 = 85%, 1.6 = 95%
    phat = float(ups) / n
    return sqrt(phat+z*z/(2*n)-z*((phat*(1-phat)+z*z/(4*n))/n))/(1+z*z/n)
 def confidence(ups, downs):
    """Return the confidence value for the given up- and down-vote counts.

    Items without any votes get 0; everything else is delegated to
    ``_confidence``.
    """
    total_votes = ups + downs
    return 0 if total_votes == 0 else _confidence(ups, downs)
 def update_hot(document):
    """Recompute and store the hotness of ``document`` in place.

    The document is expected to implement the ratings_embedded_schema in
    ``document['properties']['ratings']``: the 'positive' and 'negative'
    counts are read from it and the result is written to its 'hot' key.
    The '_created' timestamp is tagged as UTC before being passed to
    ``hot``.
    """
    # replace() only sets tzinfo; it does not convert the clock time.
    created_utc = document['_created'].replace(tzinfo=timezone.utc)
    ratings = document['properties']['ratings']
    ratings['hot'] = hot(ratings['positive'], ratings['negative'], created_utc)
--- a/tests/test_api/test_utils.py
+++ b/tests/test_api/test_utils.py
@@ -184,3 +184,31 @@ class NodeSetattrTest(unittest.TestCase):
        node_setattr(node, 'b.complex', {None: 5})
        self.assertEqual({'b': {'complex': {None: 5}}}, node)
 class TestRating(unittest.TestCase):
    def test_hotness(self):
        """The cases are listed from coldest to hottest, so sorting them by
        hotness must reproduce the order of the original list.
        """
        from datetime import datetime, timezone
        from pillar.api.utils.rating import hot

        today = datetime(2017, 2, 11, 0, 0, 0, 0, timezone.utc)
        yesterday = datetime(2017, 2, 10, 0, 0, 0, 0, timezone.utc)
        last_week = datetime(2017, 2, 5, 0, 0, 0, 0, timezone.utc)

        # (hotness, label) pairs; the label only documents the scenario.
        cases = [
            (hot(1, 8, today), 'today super bad'),
            (hot(0, 3, today), 'today slightly worse'),
            (hot(0, 2, yesterday), 'yesterday bad'),
            (hot(0, 2, today), 'today bad'),
            (hot(4, 4, last_week), 'last week controversial'),
            (hot(7, 1, last_week), 'last week very good'),
            (hot(5, 1, yesterday), 'yesterday medium'),
            (hot(5, 0, yesterday), 'yesterday good'),
            (hot(7, 1, yesterday), 'yesterday very good'),
            (hot(4, 4, today), 'today controversial'),
            (hot(7, 1, today), 'today very good'),
        ]

        ranked = sorted(cases, key=lambda case: case[0])
        for expected, actual in zip(cases, ranked):
            self.assertEqual(expected[0], actual[0])