Introduce rating functions

These hotness and confidence calculation algorithms come from Reddit
and have been tweaked based on our experience on the Dillo project.
This commit is contained in:
2018-06-03 02:09:20 +02:00
parent ce6df542cc
commit 0e14bdd09f
2 changed files with 115 additions and 0 deletions

View File

@@ -0,0 +1,87 @@
# These functions come from Reddit
# https://github.com/reddit/reddit/blob/master/r2/r2/lib/db/_sorts.pyx
# Additional resources
# http://www.redditblog.com/2009/10/reddits-new-comment-sorting-system.html
# http://www.evanmiller.org/how-not-to-sort-by-average-rating.html
# http://amix.dk/blog/post/19588
from datetime import datetime, timezone
from math import log
from math import sqrt
epoch = datetime(1970, 1, 1, 0, 0, 0, 0, timezone.utc)
def epoch_seconds(date):
"""Returns the number of seconds from the epoch to date."""
td = date - epoch
return td.days * 86400 + td.seconds + (float(td.microseconds) / 1000000)
def score(ups, downs):
return ups - downs
def hot(ups, downs, date):
"""The hot formula. Reddit's hot ranking uses the logarithm function to
weight the first votes higher than the rest.
The first 10 upvotes have the same weight as the next 100 upvotes which
have the same weight as the next 1000, etc.
Dillo authors: we modified the formula to give more weight to negative
votes when an entry is controversial.
TODO: make this function more dynamic so that different defaults can be
specified depending on the item that is being rated.
"""
s = score(ups, downs)
order = log(max(abs(s), 1), 10)
sign = 1 if s > 0 else -1 if s < 0 else 0
seconds = epoch_seconds(date) - 1134028003
base_hot = round(sign * order + seconds / 45000, 7)
if downs > 1:
rating_delta = 100 * (downs - ups) / downs
if rating_delta < 25:
# The post is controversial
return base_hot
base_hot = base_hot - (downs * 6)
return base_hot
def _confidence(ups, downs):
n = ups + downs
if n == 0:
return 0
z = 1.0 #1.0 = 85%, 1.6 = 95%
phat = float(ups) / n
return sqrt(phat+z*z/(2*n)-z*((phat*(1-phat)+z*z/(4*n))/n))/(1+z*z/n)
def confidence(ups, downs):
if ups + downs == 0:
return 0
else:
return _confidence(ups, downs)
def update_hot(document):
"""Update the hotness of a document given its current ratings.
We expect the document to implement the ratings_embedded_schema in
a 'ratings' property.
"""
dt = document['_created']
dt = dt.replace(tzinfo=timezone.utc)
document['properties']['ratings']['hot'] = hot(
document['properties']['ratings']['positive'],
document['properties']['ratings']['negative'],
dt,
)