Introduce rating functions
These hotness and confidence calculation algorithms come from Reddit and have been tweaked based on our experience on the Dillo project.
This commit is contained in:
87
pillar/api/utils/rating.py
Normal file
87
pillar/api/utils/rating.py
Normal file
@@ -0,0 +1,87 @@
|
|||||||
|
# These functions come from Reddit
|
||||||
|
# https://github.com/reddit/reddit/blob/master/r2/r2/lib/db/_sorts.pyx
|
||||||
|
|
||||||
|
# Additional resources
|
||||||
|
# http://www.redditblog.com/2009/10/reddits-new-comment-sorting-system.html
|
||||||
|
# http://www.evanmiller.org/how-not-to-sort-by-average-rating.html
|
||||||
|
# http://amix.dk/blog/post/19588
|
||||||
|
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from math import log
|
||||||
|
from math import sqrt
|
||||||
|
|
||||||
|
# Timezone-aware Unix epoch (1970-01-01 00:00:00 UTC).
epoch = datetime(year=1970, month=1, day=1, tzinfo=timezone.utc)
|
||||||
|
|
||||||
|
|
||||||
|
def epoch_seconds(date):
    """Return the number of seconds elapsed from the Unix epoch to ``date``.

    ``date`` is subtracted from the module-level, timezone-aware ``epoch``,
    so it must be timezone-aware too (Python refuses to subtract a naive
    datetime from an aware one). The result is a float that includes the
    microsecond fraction.
    """
    elapsed = date - epoch
    whole_seconds = elapsed.days * 86400 + elapsed.seconds
    return whole_seconds + (float(elapsed.microseconds) / 1000000)
|
||||||
|
|
||||||
|
|
||||||
|
def score(ups, downs):
    """Return the net vote balance: upvotes minus downvotes."""
    net_votes = ups - downs
    return net_votes
|
||||||
|
|
||||||
|
|
||||||
|
def hot(ups, downs, date):
    """Compute the hotness of an item from its votes and its date.

    Based on Reddit's hot ranking. The net vote balance contributes
    logarithmically (base 10), so the first 10 net votes weigh as much as
    the next 100, which weigh as much as the next 1000, etc. The date
    contributes linearly: every 45000 seconds (12.5 hours) adds one point,
    i.e. as much as a tenfold increase in net votes.

    Dillo authors' modification: when an item has more than one downvote
    and its downvotes exceed its upvotes by at least 25% of the downvote
    count, it is pushed down by a further 6 points per downvote. Items
    under that 25% threshold (labelled "controversial" by the original
    authors) keep the plain hotness.

    TODO: make this function more dynamic so that different defaults can be
    specified depending on the item that is being rated.

    :param ups: number of positive votes.
    :param downs: number of negative votes.
    :param date: timezone-aware datetime of the item being rated.
    :returns: the hotness as a float.
    """
    net = score(ups, downs)
    magnitude = log(max(abs(net), 1), 10)
    # Arithmetic sign of the net score: 1, 0 or -1.
    direction = (net > 0) - (net < 0)
    # 1134028003 is a fixed Unix timestamp (December 2005) used as the
    # time origin of the ranking.
    age = epoch_seconds(date) - 1134028003
    hotness = round(direction * magnitude + age / 45000, 7)

    if downs <= 1:
        return hotness

    # Surplus of downvotes over upvotes, as a percentage of the downvotes.
    downvote_surplus = 100 * (downs - ups) / downs
    if downvote_surplus < 25:
        # The post is controversial (or well received): no extra penalty.
        return hotness

    return hotness - (downs * 6)
|
||||||
|
|
||||||
|
|
||||||
|
def _confidence(ups, downs):
|
||||||
|
n = ups + downs
|
||||||
|
|
||||||
|
if n == 0:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
z = 1.0 #1.0 = 85%, 1.6 = 95%
|
||||||
|
phat = float(ups) / n
|
||||||
|
return sqrt(phat+z*z/(2*n)-z*((phat*(1-phat)+z*z/(4*n))/n))/(1+z*z/n)
|
||||||
|
|
||||||
|
|
||||||
|
def confidence(ups, downs):
    """Return the confidence score for the given vote counts.

    Yields 0 when the votes add up to zero; otherwise the computation is
    delegated to ``_confidence``.
    """
    has_votes = ups + downs != 0
    return _confidence(ups, downs) if has_votes else 0
|
||||||
|
|
||||||
|
|
||||||
|
def update_hot(document):
    """Recompute the hotness of a document and store it in place.

    We expect the document to implement the ratings_embedded_schema in
    a 'ratings' property: the 'positive' and 'negative' counts are read
    from ``document['properties']['ratings']`` and the result is written
    to its 'hot' key. Nothing is returned.

    The '_created' timestamp is stamped as UTC before use; note that any
    tzinfo it already carries is overwritten, not converted.
    """
    created = document['_created'].replace(tzinfo=timezone.utc)

    ratings = document['properties']['ratings']
    ratings['hot'] = hot(ratings['positive'], ratings['negative'], created)
|
@@ -184,3 +184,31 @@ class NodeSetattrTest(unittest.TestCase):
|
|||||||
|
|
||||||
node_setattr(node, 'b.complex', {None: 5})
|
node_setattr(node, 'b.complex', {None: 5})
|
||||||
self.assertEqual({'b': {'complex': {None: 5}}}, node)
|
self.assertEqual({'b': {'complex': {None: 5}}}, node)
|
||||||
|
|
||||||
|
|
||||||
|
class TestRating(unittest.TestCase):
    def test_hotness(self):
        """Sorting the cases by hotness must reproduce the list order.

        The cases are listed from coldest to hottest, so an ascending sort
        on the computed hotness has to leave every case where it is.
        """
        from datetime import datetime, timezone
        from pillar.api.utils.rating import hot

        today = datetime(2017, 2, 11, 0, 0, 0, 0, timezone.utc)
        yesterday = datetime(2017, 2, 10, 0, 0, 0, 0, timezone.utc)
        last_week = datetime(2017, 2, 5, 0, 0, 0, 0, timezone.utc)

        cases = [
            (hot(1, 8, today), 'today super bad'),
            (hot(0, 3, today), 'today slightly worse'),
            (hot(0, 2, yesterday), 'yesterday bad'),
            (hot(0, 2, today), 'today bad'),
            (hot(4, 4, last_week), 'last week controversial'),
            (hot(7, 1, last_week), 'last week very good'),
            (hot(5, 1, yesterday), 'yesterday medium'),
            (hot(5, 0, yesterday), 'yesterday good'),
            (hot(7, 1, yesterday), 'yesterday very good'),
            (hot(4, 4, today), 'today controversial'),
            (hot(7, 1, today), 'today very good'),
        ]

        sorted_by_hot = sorted(cases, key=lambda case: case[0])

        # Compare the hotness values position by position; the labels only
        # serve to make a failure readable.
        for expected, actual in zip(cases, sorted_by_hot):
            self.assertEqual(
                expected[0], actual[0],
                'expected %r at this rank, got %r' % (expected[1], actual[1]))
|
||||||
|
Reference in New Issue
Block a user