utils/Reviewing/find_interesting_reviews.py - llvm - Git at Google

 #!/usr/bin/env python

 from __future__ import print_function

 import argparse
 import email.mime.multipart
 import email.mime.text
 import logging
 import os.path
 import pickle
 import re
 import smtplib
 import subprocess
 import sys
 from datetime import datetime, timedelta
 from phabricator import Phabricator

 # Setting up a virtualenv to run this script can be done by running the
 # following commands:
 # $ virtualenv venv
 # $ . ./venv/bin/activate
 # $ pip install Phabricator

 # (name, clone URL) pairs of the git repositories used by this script.
 # update_git_repos() clones each one into "git_repos/<name>" when that
 # directory does not exist yet, and then runs "git pull --rebase" in it.
 GIT_REPO_METADATA = (("llvm", "https://llvm.org/git/llvm.git"), )

 # The below PhabXXX classes represent objects as modelled by Phabricator.
 # The classes can be serialized to disk, to try and make sure that we don't
 # needlessly have to re-fetch lots of data from Phabricator, as that would
 # make this script unusably slow.


 class PhabObject:
     """Common base of the objects this script mirrors from Phabricator.

     Subclasses set OBJECT_KIND to a short name; PhabObjectCache reads it to
     build the cache name and the name of the pickle file.
     """

     OBJECT_KIND = None

     def __init__(self, id):
         # The identifier is stored exactly as given.
         self.id = id


 class PhabObjectCache:
     """Cache of Phabricator objects, keyed by id, that can be pickled to disk.

     PhabObjectClass is the class instantiated by get() for ids that are not
     cached yet; its OBJECT_KIND names the cache and the pickle file.
     most_recent_info and oldest_info are timestamps (seconds since the
     epoch) maintained by the code that fills the cache; both start as None.
     """

     DEFAULT_DIRECTORY = "PhabObjectCache"

     def __init__(self, PhabObjectClass):
         self.PhabObjectClass = PhabObjectClass
         self.most_recent_info = None
         self.oldest_info = None
         self.id2PhabObjects = {}

     def get_name(self):
         """Return a display name for this cache, e.g. "ReviewsCache"."""
         return self.PhabObjectClass.OBJECT_KIND + "sCache"

     def get(self, id):
         """Return the cached object for `id`, creating an empty one if needed."""
         if id not in self.id2PhabObjects:
             self.id2PhabObjects[id] = self.PhabObjectClass(id)
         return self.id2PhabObjects[id]

     def get_ids_in_cache(self):
         """Return a list of all ids currently in the cache."""
         return list(self.id2PhabObjects.keys())

     def get_objects(self):
         """Return a list of all objects currently in the cache."""
         return list(self.id2PhabObjects.values())

     def _get_pickle_name(self, directory):
         """Return the path of this cache's pickle file inside `directory`."""
         file_name = "Phab" + self.PhabObjectClass.OBJECT_KIND + "s.pickle"
         return os.path.join(directory, file_name)

     def populate_cache_from_disk(self, directory=DEFAULT_DIRECTORY):
         """
         Load the cache state that write_cache_to_disk() stored in `directory`.

         A file that is missing or cannot be opened, and a file whose content
         cannot be unpickled into a dict, are reported on stdout and ignored:
         the in-memory state is left untouched in those cases.

         NOTE: pickle can execute arbitrary code while loading; only point
         this at cache files written by this script.

         FIXME: consider if serializing to JSON would bring interoperability
         advantages over serializing to pickle.
         """
         try:
             f = open(self._get_pickle_name(directory), "rb")
         except IOError as err:
             print("Could not find cache. Error message: {0}. Continuing..."
                   .format(err))
             return
         with f:
             try:
                 d = pickle.load(f)
                 if not isinstance(d, dict):
                     raise pickle.UnpicklingError(
                         "expected a dict, found {0}".format(type(d).__name__))
             except (EOFError, pickle.UnpicklingError, AttributeError,
                     ImportError, IndexError, ValueError) as err:
                 # A truncated file raises EOFError; other kinds of damage
                 # surface as the remaining exception types listed here.
                 print("Cache seems to be corrupt. " +
                       "Not using cache. Error message: {0}".format(err))
                 return
         self.__dict__.update(d)

     def write_cache_to_disk(self, directory=DEFAULT_DIRECTORY):
         """Pickle the complete cache state into `directory`, creating it if
         it does not exist yet."""
         if not os.path.exists(directory):
             os.makedirs(directory)
         with open(self._get_pickle_name(directory), "wb") as f:
             pickle.dump(self.__dict__, f)
         print("wrote cache to disk, most_recent_info= {0}".format(
             datetime.fromtimestamp(self.most_recent_info)
             if self.most_recent_info is not None else None))


 class PhabReview(PhabObject):
     """A review, with the metadata and diffs recorded for it."""

     OBJECT_KIND = "Review"

     def __init__(self, id):
         PhabObject.__init__(self, id)

     def update(self, title, dateCreated, dateModified, author):
         """Replace the stored metadata of the review with the given values."""
         self.author = author
         self.dateModified = dateModified
         self.dateCreated = dateCreated
         self.title = title

     def setPhabDiffs(self, phabDiffs):
         """Store the diffs that belong to this review."""
         self.phabDiffs = phabDiffs


 class PhabUser(PhabObject):
     """A Phabricator user: its PHID and real name."""

     OBJECT_KIND = "User"

     def __init__(self, id):
         PhabObject.__init__(self, id)

     def update(self, phid, realName):
         """Replace the stored PHID and real name of the user."""
         self.realName = realName
         self.phid = phid


 class PhabHunk:
     """One hunk of a diff, reduced to the old-file line ranges it touches.

     After construction, actual_lines_changed_offset holds a list of
     (start, end) line-number pairs in the old version of the file. Each pair
     covers a run of consecutive added/removed lines plus up to three lines
     of context on either side; pairs that overlap or touch are merged.
     """

     def __init__(self, rest_api_hunk):
         self.oldOffset = int(rest_api_hunk["oldOffset"])
         self.oldLength = int(rest_api_hunk["oldLength"])

         context_lines = 3
         raw_ranges = []
         # Line number in the old file that the current corpus line maps to.
         line_no = self.oldOffset
         # Start of the run of changes being scanned; None outside a run.
         run_start = None
         for line in rest_api_hunk["corpus"].split("\n"):
             added = line.startswith("+")
             removed = line.startswith("-")
             if added or removed:
                 if run_start is None:
                     # Never reach back before the start of the hunk.
                     run_start = max(self.oldOffset, line_no - context_lines)
                 if removed:
                     # Only removed lines exist in the old file; added lines
                     # do not advance the old-file position.
                     line_no += 1
                 continue
             # A context line closes any run that is in progress.
             if run_start is not None:
                 raw_ranges.append((run_start, line_no + context_lines))
                 run_start = None
             line_no += 1
         if run_start is not None:
             # The corpus ended in the middle of a run of changes.
             raw_ranges.append((run_start, line_no + context_lines))

         # Widening by the context lines can make neighbouring ranges overlap
         # or touch; fold each such range into its predecessor.
         merged = []
         for start, end in raw_ranges:
             if merged and merged[-1][1] >= start:
                 merged[-1] = (merged[-1][0], end)
             else:
                 merged.append((start, end))
         self.actual_lines_changed_offset = merged


 class PhabChange:
     """The part of a diff that applies to one file: its old path and hunks."""

     def __init__(self, rest_api_change):
         self.oldPath = rest_api_change["oldPath"]
         hunks = []
         for raw_hunk in rest_api_change["hunks"]:
             hunks.append(PhabHunk(raw_hunk))
         self.hunks = hunks


 class PhabDiff(PhabObject):
     """One diff (uploaded version of a patch) of a review."""

     OBJECT_KIND = "Diff"

     def __init__(self, id):
         PhabObject.__init__(self, id)

     def update(self, rest_api_results):
         """Copy the fields this script needs out of the API result for one
         diff; the timestamps are converted to int."""
         self.revisionID = rest_api_results["revisionID"]
         self.dateModified = int(rest_api_results["dateModified"])
         self.dateCreated = int(rest_api_results["dateCreated"])
         self.changes = list(map(PhabChange, rest_api_results["changes"]))


 class ReviewsCache(PhabObjectCache):
     """A PhabObjectCache that holds PhabReview objects."""

     def __init__(self):
         PhabObjectCache.__init__(self, PhabObjectClass=PhabReview)


 class UsersCache(PhabObjectCache):
     """A PhabObjectCache that holds PhabUser objects."""

     def __init__(self):
         PhabObjectCache.__init__(self, PhabObjectClass=PhabUser)


 # Module-level cache instances. They are listed in PHABCACHESINFO and read
 # directly by get_most_recent_reviews() and get_real_name_from_author().
 reviews_cache = ReviewsCache()
 users_cache = UsersCache()


 def init_phab_connection():
     """Create a Phabricator client, call update_interfaces() on it and
     return it."""
     connection = Phabricator()
     connection.update_interfaces()
     return connection


 def update_cached_info(phab, cache, phab_query, order, record_results,
                        max_nr_entries_per_fetch, max_nr_days_to_cache):
     """Fetch pages of results from Phabricator and record them in `cache`.

     phab_query is a sequence of attribute names that is resolved step by
     step on `phab` to find the query function. That function is called with
     `order`, a page `limit` of max_nr_entries_per_fetch and, for follow-up
     pages, the `after` cursor of the previous page.

     record_results(cache, results, phab) stores one page in the cache and
     returns (most_recent_info, oldest_info) timestamps for that page, or
     (None, None) when the page contained nothing to record.

     Paging stops when there is no further page, when the fetched data
     reaches back max_nr_days_to_cache days from the newest entry, or when
     the cache already covers both the older and the newer data. The cache
     is written to disk after every page.
     """
     q = phab
     LIMIT = max_nr_entries_per_fetch
     for query_step in phab_query:
         q = getattr(q, query_step)
     results = q(order=order, limit=LIMIT)
     most_recent_info, oldest_info = record_results(cache, results, phab)
     after = results["cursor"]["after"]
     if most_recent_info is None or oldest_info is None:
         # The first page had nothing to record, so there is no timestamp to
         # anchor the fetch window on. Leave the cache timestamps as they are
         # instead of failing on datetime.fromtimestamp(None).
         print("after: {0!r}".format(after))
         print("No entries to record; cache timestamps left unchanged.")
         cache.write_cache_to_disk()
         return
     oldest_info_to_fetch = datetime.fromtimestamp(most_recent_info) - \
         timedelta(days=max_nr_days_to_cache)
     most_recent_info_overall = most_recent_info
     cache.write_cache_to_disk()
     print("after: {0!r}".format(after))
     print("most_recent_info: {0}".format(
         datetime.fromtimestamp(most_recent_info)))
     while (after is not None
            and datetime.fromtimestamp(oldest_info) > oldest_info_to_fetch):
         need_more_older_data = \
             (cache.oldest_info is None or
              datetime.fromtimestamp(cache.oldest_info) > oldest_info_to_fetch)
         print(("need_more_older_data={0} cache.oldest_info={1} " +
                "oldest_info_to_fetch={2}").format(
                    need_more_older_data,
                    datetime.fromtimestamp(cache.oldest_info)
                    if cache.oldest_info is not None else None,
                    oldest_info_to_fetch))
         need_more_newer_data = \
             (cache.most_recent_info is None or
              cache.most_recent_info < most_recent_info)
         print(("need_more_newer_data={0} cache.most_recent_info={1} " +
                "most_recent_info={2}")
               .format(need_more_newer_data, cache.most_recent_info,
                       most_recent_info))
         if not need_more_older_data and not need_more_newer_data:
             break
         results = q(order=order, after=after, limit=LIMIT)
         page_most_recent, page_oldest = record_results(cache, results, phab)
         # A page without recordable entries reports None; keep the values of
         # the previous page in that case so the loop condition stays valid.
         if page_most_recent is not None:
             most_recent_info = page_most_recent
         if page_oldest is not None:
             oldest_info = page_oldest
         after = results["cursor"]["after"]
         print("after: {0!r}".format(after))
         print("most_recent_info: {0}".format(
             datetime.fromtimestamp(most_recent_info)))
         cache.write_cache_to_disk()
     cache.most_recent_info = most_recent_info_overall
     if after is None:
         # We did fetch all records. Mark the cache to contain all info since
         # the start of time.
         oldest_info = 0
     cache.oldest_info = oldest_info
     cache.write_cache_to_disk()


 def record_reviews(cache, reviews, phab):
     """Store one page of review search results in `cache`.

     Entries whose type is not "DREV" are skipped. A review that is new to
     the cache, or whose dateModified is newer than the cached one, has its
     diffs re-fetched through phab.differential.querydiffs and its metadata
     replaced.

     Returns (most_recent_info, oldest_info): the largest and smallest
     dateModified among the processed entries, or (None, None) if there were
     none.
     """
     newest = None
     oldest = None
     for entry in reviews["data"]:
         if entry["type"] != "DREV":
             continue
         review_id = entry["id"]
         fields = entry["fields"]
         modified = int(fields["dateModified"])
         created = int(fields["dateCreated"])
         title = fields["title"]
         author = fields["authorPHID"]
         review = cache.get(review_id)
         if "dateModified" not in vars(review) or \
                 modified > review.dateModified:
             diff_results = phab.differential.querydiffs(
                 revisionIDs=[review_id])
             diffs = []
             for diff_id in sorted(diff_results.keys()):
                 diff = PhabDiff(diff_id)
                 diff.update(diff_results[diff_id])
                 diffs.append(diff)
             review.update(title, created, modified, author)
             review.setPhabDiffs(diffs)
             print("Updated D{0} modified on {1} ({2} diffs)".format(
                 review_id, datetime.fromtimestamp(modified), len(diffs)))

         newest = modified if newest is None else max(newest, modified)
         oldest = modified if oldest is None else min(oldest, modified)
     return newest, oldest


 def record_users(cache, users, phab):
     """Store one page of user search results in `cache`.

     Entries whose type is not "USER" are skipped; for the others the PHID
     and real name are written to the cached user. `phab` is unused here.

     Returns (most_recent_info, oldest_info): the largest and smallest
     dateModified among the processed entries, or (None, None) if there were
     none.
     """
     newest = None
     oldest = None
     for entry in users["data"]:
         if entry["type"] != "USER":
             continue
         user_id = entry["id"]
         phid = entry["phid"]
         modified = int(entry["fields"]["dateModified"])
         real_name = entry["fields"]["realName"]
         cache.get(user_id).update(phid, real_name)
         newest = modified if newest is None else max(newest, modified)
         oldest = modified if oldest is None else min(oldest, modified)
     return newest, oldest


 # One entry per cache that is kept in sync with Phabricator. Each entry is
 # (cache, phab_query, order, record_results, max_nr_entries_per_fetch,
 #  max_nr_days_to_cache); update_cache() passes these, in this order, to
 # update_cached_info(). phab_query is the chain of attribute names that
 # leads from the Phabricator client to the query function.
 PHABCACHESINFO = ((reviews_cache, ("differential", "revision", "search"),
                    "updated", record_reviews, 5, 7),
                   (users_cache, ("user", "search"), "newest", record_users,
                    100, 1000))


 def load_cache():
     """Load every cache listed in PHABCACHESINFO from disk and print how
     many entries it holds and how recent its newest information is."""
     for cache_info in PHABCACHESINFO:
         cache = cache_info[0]
         cache.populate_cache_from_disk()
         name = cache.get_name()
         nr_entries = len(cache.get_ids_in_cache())
         print("Loaded {0} nr entries: {1}".format(name, nr_entries))
         if cache.most_recent_info is None:
             most_recent = None
         else:
             most_recent = datetime.fromtimestamp(cache.most_recent_info)
         print("Loaded {0} has most recent info: {1}".format(
             name, most_recent))


 def update_cache(phab):
     """Load the caches from disk, bring each of them up to date with
     Phabricator and write them back to disk."""
     load_cache()
     for cache_info in PHABCACHESINFO:
         # Each entry lists the arguments of update_cached_info() after
         # `phab`, in order, starting with the cache itself.
         cache = cache_info[0]
         update_cached_info(phab, *cache_info)
         nr_objects = len(cache.get_ids_in_cache())
         print("{0} objects in {1}".format(nr_objects, cache.get_name()))
         cache.write_cache_to_disk()


 def get_most_recent_reviews(days):
     """Return the cached reviews modified within `days` days of the most
     recently modified cached review, newest first."""
     ordered = sorted(
         reviews_cache.get_objects(),
         key=lambda r: r.dateModified, reverse=True)
     if not ordered:
         return ordered
     # The window is anchored on the newest cached review, not on "now".
     cut_off_date = \
         datetime.fromtimestamp(ordered[0].dateModified) - timedelta(days=days)
     recent = []
     for review in ordered:
         if datetime.fromtimestamp(review.dateModified) < cut_off_date:
             break
         recent.append(review)
     return recent


 # All of the above code is about fetching data from Phabricator and caching it
 # on local disk. The below code contains the actual "business logic" for this
 # script.

 # Map from user PHID to real name. It is built from users_cache on the first
 # call of get_real_name_from_author() and reused afterwards.
 _userphid2realname = None


 def get_real_name_from_author(user_phid):
     """Return the real name cached for `user_phid`, or "unknown" if the
     PHID is not in the users cache."""
     global _userphid2realname
     if _userphid2realname is None:
         # Build the lookup table once, on first use.
         _userphid2realname = {
             user.phid: user.realName for user in users_cache.get_objects()}
     try:
         return _userphid2realname[user_phid]
     except KeyError:
         return "unknown"


 def print_most_recent_reviews(phab, days, filter_reviewers):
     """Print the report of reviews and potential reviewers and return it.

     The reviews considered are those returned by
     get_most_recent_reviews(days). For each of them the potential reviewers
     are computed and passed through `filter_reviewers`; reviews without any
     remaining reviewer are left out. The report lists the reviews first and
     then, per reviewer, the reviews matched to that reviewer sorted by
     descending score. `phab` is not used.

     Returns the printed messages joined with newlines.
     """
     report = []

     def emit(text):
         report.append(text)
         print(text)

     def format_scores(scores):
         return "/".join(["{0:.1f}%".format(s) for s in scores])

     newest_reviews = get_most_recent_reviews(days)
     emit(u"These are the reviews that look interesting to be reviewed. "
          u"The report below has 2 sections. The first "
          u"section is organized per review; the second section is organized "
          u"per potential reviewer.\n")
     if len(newest_reviews) > 0:
         oldest_datetime = datetime.fromtimestamp(
             newest_reviews[-1].dateModified)
     else:
         oldest_datetime = None
     emit((u"The report below is based on analyzing the reviews that got "
           u"touched in the past {0} days (since {1}). "
           u"The script found {2} such reviews.\n").format(
               days, oldest_datetime, len(newest_reviews)))

     # Section 1: one entry per review that has at least one reviewer left
     # after filtering.
     reviews_by_reviewer = {}
     for index, review in enumerate(newest_reviews):
         candidates = filter_reviewers(find_reviewers_for_review(review))
         if len(candidates) == 0:
             continue
         emit((u"{0:>3}. https://reviews.llvm.org/D{1} by {2}\n     {3}\n"
               u"     Last updated on {4}").format(
                   index, review.id,
                   get_real_name_from_author(review.author), review.title,
                   datetime.fromtimestamp(review.dateModified)))
         for reviewer, scores in candidates:
             emit(u"    potential reviewer {0}, score {1}".format(
                 reviewer, "(" + format_scores(scores) + ")"))
             reviews_by_reviewer.setdefault(reviewer, []).append(
                 (review, scores))

     # Section 2: one summary per reviewer, best matching reviews first.
     for reviewer in sorted(reviews_by_reviewer.keys()):
         ranked = sorted(
             reviews_by_reviewer[reviewer],
             key=lambda rs: rs[1], reverse=True)
         emit(u"\n\nSUMMARY FOR {0} (found {1} reviews):".format(
             reviewer, len(ranked)))
         for review, scores in ranked:
             emit(u"[{0}] https://reviews.llvm.org/D{1} '{2}' by {3}".format(
                 format_scores(scores), review.id,
                 review.title, get_real_name_from_author(review.author)))
     return "\n".join(report)


 def get_git_cmd_output(cmd):
     """Run `cmd` through the shell and return what it printed.

     stderr is merged into stdout. The output is decoded as UTF-8, dropping
     bytes that cannot be decoded. Returns None when the command exits with
     a non-zero status; the failure is logged at debug level.
     """
     logging.debug(cmd)
     try:
         raw_output = subprocess.check_output(
             cmd, shell=True, stderr=subprocess.STDOUT)
     except subprocess.CalledProcessError as e:
         logging.debug(str(e))
         return None
     return raw_output.decode("utf-8", errors='ignore')


 # Matches the "author-mail <address>" lines of "git blame --line-porcelain"
 # output and captures the address.
 reAuthorMail = re.compile("^author-mail <([^>]*)>.*$")


 def parse_blame_output_line_porcelain(blame_output):
     """Count how often each author email address occurs in `blame_output`.

     Returns a dict mapping email address to its number of "author-mail"
     lines; the dict is empty when `blame_output` is None.
     """
     if blame_output is None:
         return {}
     counts = {}
     for line in blame_output.split('\n'):
         match = reAuthorMail.match(line)
         if not match:
             continue
         address = match.group(1)
         counts[address] = counts.get(address, 0) + 1
     return counts


 def find_reviewers_for_diff_heuristic(diff):
     """Score potential reviewers for `diff` with two git-blame heuristics.

     Heuristic 1: assume good reviewers are the ones that touched the same
     lines before as this patch is touching.
     Heuristic 2: assume good reviewers are the ones that touched the same
     files before as this patch is touching.

     Returns a list of (author_email, (line_score, file_score)) tuples sorted
     by descending scores. line_score is the author's share, in percent, of
     the blamed lines around the changes; file_score is the percentage of
     the changed files the author appears in. Returns an empty list when the
     base revision cannot be determined.
     """
     # File paths come from the reviews on Phabricator and end up in a shell
     # command line, so they have to be quoted. pipes.quote is the Python 2
     # spelling of shlex.quote.
     try:
         from shlex import quote
     except ImportError:
         from pipes import quote

     reviewers2nr_lines_touched = {}
     reviewers2nr_files_touched = {}
     # Assume last revision before diff was modified is the revision the diff
     # applies to.
     git_repo = "git_repos/llvm"
     # %S is seconds; the lowercase %s used before is a platform-specific
     # extension (seconds since the epoch on glibc), not a seconds field.
     cmd = 'git -C {0} rev-list -n 1 --before="{1}" master'.format(
         git_repo,
         datetime.fromtimestamp(
             diff.dateModified).strftime("%Y-%m-%d %H:%M:%S"))
     base_revision = get_git_cmd_output(cmd)
     if base_revision is None:
         # git failed (e.g. the repository is missing); nothing to blame.
         logging.debug("Could not determine base revision")
         return []
     base_revision = base_revision.strip()
     logging.debug("Base revision={0}".format(base_revision))
     for change in diff.changes:
         if change.oldPath is None:
             # No old file to blame; quote() cannot handle None either.
             continue
         path = quote(change.oldPath)
         # Compute heuristic 1: look at context of patch lines.
         for hunk in change.hunks:
             for start_line, end_line in hunk.actual_lines_changed_offset:
                 # Collect git blame results for authors in those ranges.
                 cmd = ("git -C {0} blame --encoding=utf-8 --date iso -f -e " +
                        "-w --line-porcelain -L {1},{2} {3} -- {4}").format(
                            git_repo, start_line, end_line, base_revision, path)
                 blame_output = get_git_cmd_output(cmd)
                 for reviewer, nr_occurences in \
                         parse_blame_output_line_porcelain(blame_output).items():
                     reviewers2nr_lines_touched[reviewer] = \
                         reviewers2nr_lines_touched.get(reviewer, 0) + \
                         nr_occurences
         # Compute heuristic 2: don't look at context, just at files touched.
         # Collect git blame results for all authors of the file.
         cmd = ("git -C {0} blame --encoding=utf-8 --date iso -f -e -w " +
                "--line-porcelain {1} -- {2}").format(git_repo, base_revision,
                                                      path)
         blame_output = get_git_cmd_output(cmd)
         for reviewer in parse_blame_output_line_porcelain(blame_output):
             # Count files, not lines: one per file the author appears in.
             reviewers2nr_files_touched[reviewer] = \
                 reviewers2nr_files_touched.get(reviewer, 0) + 1

     # Compute "match scores"
     total_nr_lines = sum(reviewers2nr_lines_touched.values())
     total_nr_files = len(diff.changes)
     reviewers_matchscores = []
     for reviewer, nr_files in reviewers2nr_files_touched.items():
         line_score = \
             reviewers2nr_lines_touched.get(reviewer, 0) * 100.0 / \
             total_nr_lines if total_nr_lines != 0 else 0
         file_score = \
             nr_files * 100.0 / total_nr_files if total_nr_files != 0 else 0
         reviewers_matchscores.append((reviewer, (line_score, file_score)))
     reviewers_matchscores.sort(key=lambda i: i[1], reverse=True)
     return reviewers_matchscores


 def find_reviewers_for_review(review):
     """Return the potential reviewers for the newest diff of `review`.

     The result is the list produced by find_reviewers_for_diff_heuristic()
     for the diff with the largest dateModified. A review without diffs
     yields an empty list, so callers can always iterate over the result.
     """
     if len(review.phabDiffs) == 0:
         return []
     # Only the newest diff is analysed.
     diff = max(review.phabDiffs, key=lambda d: d.dateModified)
     matched_reviewers = find_reviewers_for_diff_heuristic(diff)
     # Show progress, as this is a slow operation:
     sys.stdout.write('.')
     sys.stdout.flush()
     logging.debug(u"matched_reviewers: {0}".format(matched_reviewers))
     return matched_reviewers


 def update_git_repos():
     """Clone the repositories of GIT_REPO_METADATA that are not present
     under "git_repos" yet, and run "git pull --rebase" in each of them."""
     git_repos_directory = "git_repos"
     for name, url in GIT_REPO_METADATA:
         dirname = os.path.join(git_repos_directory, name)
         commands = []
         if not os.path.exists(dirname):
             commands.append("git clone {0} {1}".format(url, dirname))
         # The pull also runs right after a fresh clone.
         commands.append("git -C {0} pull --rebase".format(dirname))
         for cmd in commands:
             get_git_cmd_output(cmd)


 def send_emails(email_addresses, sender, msg):
     """Email the report `msg` to every address in `email_addresses`.

     One message is sent per recipient through the default SMTP server, with
     `sender` as the From address. Nothing is sent, and no SMTP connection is
     opened, when `email_addresses` is empty. The SMTP session is closed even
     if sending fails.
     """
     if not email_addresses:
         return
     # MIMEText is given the charset explicitly so the message is labelled as
     # UTF-8. On Python 2 the report is a unicode object that has to be
     # encoded first; on Python 3 MIMEText needs the unencoded str.
     body = msg if isinstance(msg, str) else msg.encode('utf-8')
     s = smtplib.SMTP()
     s.connect()
     try:
         for email_address in email_addresses:
             email_msg = email.mime.multipart.MIMEMultipart()
             email_msg['From'] = sender
             email_msg['To'] = email_address
             email_msg['Subject'] = 'LLVM patches you may be able to review.'
             email_msg.attach(
                 email.mime.text.MIMEText(body, 'plain', 'utf-8'))
             # python 3.x: s.send_message(email_msg)
             s.sendmail(email_msg['From'], email_msg['To'],
                        email_msg.as_string())
     finally:
         s.quit()


 def filter_reviewers_to_report_for(people_to_look_for):
     """Return a filter for lists of (reviewer, scores) entries.

     The returned function keeps only the entries whose reviewer is in
     `people_to_look_for`. This is just an example filter, to only report
     potential reviews to do for the people that will receive the report
     email.
     """
     def keep_reported_people(potential_reviewers):
         kept = []
         for entry in potential_reviewers:
             if entry[0] in people_to_look_for:
                 kept.append(entry)
         return kept

     return keep_reported_people


 def main():
     """Command-line entry point.

     Optionally updates the Phabricator caches, updates the git checkouts,
     prints the report of reviews that the people given with
     --email-addresses could review, and emails that report to the
     --email-report recipients, if any were given.
     """
     parser = argparse.ArgumentParser(
         description='Match open reviews to potential reviewers.')
     parser.add_argument(
         '--no-update-cache',
         dest='update_cache',
         action='store_false',
         default=True,
         help='Do not update cached Phabricator objects')
     # The list-valued options default to an empty list so that leaving them
     # out behaves like passing no values.
     parser.add_argument(
         '--email-report',
         dest='email_report',
         nargs='*',
         default=[],
         help="A email addresses to send the report to.")
     parser.add_argument(
         '--sender',
         dest='sender',
         default="",
         help="The email address to use in 'From' on messages emailed out.")
     parser.add_argument(
         '--email-addresses',
         dest='email_addresses',
         nargs='*',
         default=[],
         help="The email addresses (as known by LLVM git) of " +
         "the people to look for reviews for.")
     # Without a default, a missing -v leaves None, which cannot be compared
     # with an int on Python 3.
     parser.add_argument('--verbose', '-v', action='count', default=0)

     args = parser.parse_args()

     if args.verbose >= 1:
         logging.basicConfig(level=logging.DEBUG)

     # Python 2 hands over byte strings that need decoding; on Python 3 the
     # arguments are text already.
     people_to_look_for = [
         e.decode('utf-8') if isinstance(e, bytes) else e
         for e in args.email_addresses]
     logging.debug("Will look for reviews that following contributors could " +
                   "review: {}".format(people_to_look_for))
     logging.debug("Will email a report to: {}".format(args.email_report))

     phab = init_phab_connection()

     if args.update_cache:
         update_cache(phab)

     load_cache()
     update_git_repos()
     msg = print_most_recent_reviews(
         phab,
         days=1,
         filter_reviewers=filter_reviewers_to_report_for(people_to_look_for))

     # Only contact the SMTP server when there is somebody to send to.
     if args.email_report:
         send_emails(args.email_report, args.sender, msg)


 if __name__ == "__main__":
     main()
	#!/usr/bin/env python

	from __future__ import print_function

	import argparse
	import email.mime.multipart
	import email.mime.text
	import logging
	import os.path
	import pickle
	import re
	import smtplib
	import subprocess
	import sys
	from datetime import datetime, timedelta
	from phabricator import Phabricator

	# Setting up a virtualenv to run this script can be done by running the
	# following commands:
	# $ virtualenv venv
	# $ . ./venv/bin/activate
	# $ pip install Phabricator

	# (name, clone URL) pairs of the git repositories used by this script.
	GIT_REPO_METADATA = (("llvm", "https://llvm.org/git/llvm.git"), )

	# The below PhabXXX classes represent objects as modelled by Phabricator.
	# The classes can be serialized to disk, to try and make sure that we don't
	# needlessly have to re-fetch lots of data from Phabricator, as that would
	# make this script unusably slow.


	class PhabObject:
	    """Common base of the objects this script mirrors from Phabricator.

	    Subclasses set OBJECT_KIND to a short name; PhabObjectCache reads it to
	    build the cache name and the name of the pickle file.
	    """

	    OBJECT_KIND = None

	    def __init__(self, id):
	        # The identifier is stored exactly as given.
	        self.id = id


	class PhabObjectCache:
	    """Cache of Phabricator objects, keyed by id, that can be pickled to disk.

	    PhabObjectClass is the class instantiated by get() for ids that are not
	    cached yet; its OBJECT_KIND names the cache and the pickle file.
	    most_recent_info and oldest_info are timestamps (seconds since the
	    epoch) maintained by the code that fills the cache; both start as None.
	    """

	    DEFAULT_DIRECTORY = "PhabObjectCache"

	    def __init__(self, PhabObjectClass):
	        self.PhabObjectClass = PhabObjectClass
	        self.most_recent_info = None
	        self.oldest_info = None
	        self.id2PhabObjects = {}

	    def get_name(self):
	        """Return a display name for this cache, e.g. "ReviewsCache"."""
	        return self.PhabObjectClass.OBJECT_KIND + "sCache"

	    def get(self, id):
	        """Return the cached object for `id`, creating an empty one if needed."""
	        if id not in self.id2PhabObjects:
	            self.id2PhabObjects[id] = self.PhabObjectClass(id)
	        return self.id2PhabObjects[id]

	    def get_ids_in_cache(self):
	        """Return a list of all ids currently in the cache."""
	        return list(self.id2PhabObjects.keys())

	    def get_objects(self):
	        """Return a list of all objects currently in the cache."""
	        return list(self.id2PhabObjects.values())

	    def _get_pickle_name(self, directory):
	        """Return the path of this cache's pickle file inside `directory`."""
	        file_name = "Phab" + self.PhabObjectClass.OBJECT_KIND + "s.pickle"
	        return os.path.join(directory, file_name)

	    def populate_cache_from_disk(self, directory=DEFAULT_DIRECTORY):
	        """
	        Load the cache state that write_cache_to_disk() stored in `directory`.

	        A file that is missing or cannot be opened, and a file whose content
	        cannot be unpickled into a dict, are reported on stdout and ignored:
	        the in-memory state is left untouched in those cases.

	        NOTE: pickle can execute arbitrary code while loading; only point
	        this at cache files written by this script.

	        FIXME: consider if serializing to JSON would bring interoperability
	        advantages over serializing to pickle.
	        """
	        try:
	            f = open(self._get_pickle_name(directory), "rb")
	        except IOError as err:
	            print("Could not find cache. Error message: {0}. Continuing..."
	                  .format(err))
	            return
	        with f:
	            try:
	                d = pickle.load(f)
	                if not isinstance(d, dict):
	                    raise pickle.UnpicklingError(
	                        "expected a dict, found {0}".format(type(d).__name__))
	            except (EOFError, pickle.UnpicklingError, AttributeError,
	                    ImportError, IndexError, ValueError) as err:
	                # A truncated file raises EOFError; other kinds of damage
	                # surface as the remaining exception types listed here.
	                print("Cache seems to be corrupt. " +
	                      "Not using cache. Error message: {0}".format(err))
	                return
	        self.__dict__.update(d)

	    def write_cache_to_disk(self, directory=DEFAULT_DIRECTORY):
	        """Pickle the complete cache state into `directory`, creating it if
	        it does not exist yet."""
	        if not os.path.exists(directory):
	            os.makedirs(directory)
	        with open(self._get_pickle_name(directory), "wb") as f:
	            pickle.dump(self.__dict__, f)
	        print("wrote cache to disk, most_recent_info= {0}".format(
	            datetime.fromtimestamp(self.most_recent_info)
	            if self.most_recent_info is not None else None))


	class PhabReview(PhabObject):
	    """A review, with the metadata and diffs recorded for it."""

	    OBJECT_KIND = "Review"

	    def __init__(self, id):
	        PhabObject.__init__(self, id)

	    def update(self, title, dateCreated, dateModified, author):
	        """Replace the stored metadata of the review with the given values."""
	        self.author = author
	        self.dateModified = dateModified
	        self.dateCreated = dateCreated
	        self.title = title

	    def setPhabDiffs(self, phabDiffs):
	        """Store the diffs that belong to this review."""
	        self.phabDiffs = phabDiffs


	class PhabUser(PhabObject):
	    """A Phabricator user: its PHID and real name."""

	    OBJECT_KIND = "User"

	    def __init__(self, id):
	        PhabObject.__init__(self, id)

	    def update(self, phid, realName):
	        """Replace the stored PHID and real name of the user."""
	        self.realName = realName
	        self.phid = phid


	class PhabHunk:
	    """One hunk of a diff, reduced to the old-file line ranges it touches.

	    After construction, actual_lines_changed_offset holds a list of
	    (start, end) line-number pairs in the old version of the file. Each pair
	    covers a run of consecutive added/removed lines plus up to three lines
	    of context on either side; pairs that overlap or touch are merged.
	    """

	    def __init__(self, rest_api_hunk):
	        self.oldOffset = int(rest_api_hunk["oldOffset"])
	        self.oldLength = int(rest_api_hunk["oldLength"])

	        context_lines = 3
	        raw_ranges = []
	        # Line number in the old file that the current corpus line maps to.
	        line_no = self.oldOffset
	        # Start of the run of changes being scanned; None outside a run.
	        run_start = None
	        for line in rest_api_hunk["corpus"].split("\n"):
	            added = line.startswith("+")
	            removed = line.startswith("-")
	            if added or removed:
	                if run_start is None:
	                    # Never reach back before the start of the hunk.
	                    run_start = max(self.oldOffset, line_no - context_lines)
	                if removed:
	                    # Only removed lines exist in the old file; added lines
	                    # do not advance the old-file position.
	                    line_no += 1
	                continue
	            # A context line closes any run that is in progress.
	            if run_start is not None:
	                raw_ranges.append((run_start, line_no + context_lines))
	                run_start = None
	            line_no += 1
	        if run_start is not None:
	            # The corpus ended in the middle of a run of changes.
	            raw_ranges.append((run_start, line_no + context_lines))

	        # Widening by the context lines can make neighbouring ranges overlap
	        # or touch; fold each such range into its predecessor.
	        merged = []
	        for start, end in raw_ranges:
	            if merged and merged[-1][1] >= start:
	                merged[-1] = (merged[-1][0], end)
	            else:
	                merged.append((start, end))
	        self.actual_lines_changed_offset = merged


	class PhabChange:
	    """The part of a diff that applies to one file: its old path and hunks."""

	    def __init__(self, rest_api_change):
	        self.oldPath = rest_api_change["oldPath"]
	        hunks = []
	        for raw_hunk in rest_api_change["hunks"]:
	            hunks.append(PhabHunk(raw_hunk))
	        self.hunks = hunks


	class PhabDiff(PhabObject):
	    """One diff (uploaded version of a patch) of a review."""

	    OBJECT_KIND = "Diff"

	    def __init__(self, id):
	        PhabObject.__init__(self, id)

	    def update(self, rest_api_results):
	        """Copy the fields this script needs out of the API result for one
	        diff; the timestamps are converted to int."""
	        self.revisionID = rest_api_results["revisionID"]
	        self.dateModified = int(rest_api_results["dateModified"])
	        self.dateCreated = int(rest_api_results["dateCreated"])
	        self.changes = list(map(PhabChange, rest_api_results["changes"]))


	class ReviewsCache(PhabObjectCache):
	    """A PhabObjectCache that holds PhabReview objects."""

	    def __init__(self):
	        PhabObjectCache.__init__(self, PhabObjectClass=PhabReview)


	class UsersCache(PhabObjectCache):
	    """A PhabObjectCache that holds PhabUser objects."""

	    def __init__(self):
	        PhabObjectCache.__init__(self, PhabObjectClass=PhabUser)


	# Module-level cache instances, one for reviews and one for users.
	reviews_cache = ReviewsCache()
	users_cache = UsersCache()


	def init_phab_connection():
	    """Create a Phabricator client, call update_interfaces() on it and
	    return it."""
	    connection = Phabricator()
	    connection.update_interfaces()
	    return connection


	def update_cached_info(phab, cache, phab_query, order, record_results,
	                       max_nr_entries_per_fetch, max_nr_days_to_cache):
	    """Fetch pages of results from Phabricator and record them in `cache`.

	    phab_query is a sequence of attribute names that is resolved step by
	    step on `phab` to find the query function. That function is called with
	    `order`, a page `limit` of max_nr_entries_per_fetch and, for follow-up
	    pages, the `after` cursor of the previous page.

	    record_results(cache, results, phab) stores one page in the cache and
	    returns (most_recent_info, oldest_info) timestamps for that page, or
	    (None, None) when the page contained nothing to record.

	    Paging stops when there is no further page, when the fetched data
	    reaches back max_nr_days_to_cache days from the newest entry, or when
	    the cache already covers both the older and the newer data. The cache
	    is written to disk after every page.
	    """
	    q = phab
	    LIMIT = max_nr_entries_per_fetch
	    for query_step in phab_query:
	        q = getattr(q, query_step)
	    results = q(order=order, limit=LIMIT)
	    most_recent_info, oldest_info = record_results(cache, results, phab)
	    after = results["cursor"]["after"]
	    if most_recent_info is None or oldest_info is None:
	        # The first page had nothing to record, so there is no timestamp to
	        # anchor the fetch window on. Leave the cache timestamps as they are
	        # instead of failing on datetime.fromtimestamp(None).
	        print("after: {0!r}".format(after))
	        print("No entries to record; cache timestamps left unchanged.")
	        cache.write_cache_to_disk()
	        return
	    oldest_info_to_fetch = datetime.fromtimestamp(most_recent_info) - \
	        timedelta(days=max_nr_days_to_cache)
	    most_recent_info_overall = most_recent_info
	    cache.write_cache_to_disk()
	    print("after: {0!r}".format(after))
	    print("most_recent_info: {0}".format(
	        datetime.fromtimestamp(most_recent_info)))
	    while (after is not None
	           and datetime.fromtimestamp(oldest_info) > oldest_info_to_fetch):
	        need_more_older_data = \
	            (cache.oldest_info is None or
	             datetime.fromtimestamp(cache.oldest_info) > oldest_info_to_fetch)
	        print(("need_more_older_data={0} cache.oldest_info={1} " +
	               "oldest_info_to_fetch={2}").format(
	                   need_more_older_data,
	                   datetime.fromtimestamp(cache.oldest_info)
	                   if cache.oldest_info is not None else None,
	                   oldest_info_to_fetch))
	        need_more_newer_data = \
	            (cache.most_recent_info is None or
	             cache.most_recent_info < most_recent_info)
	        print(("need_more_newer_data={0} cache.most_recent_info={1} " +
	               "most_recent_info={2}")
	              .format(need_more_newer_data, cache.most_recent_info,
	                      most_recent_info))
	        if not need_more_older_data and not need_more_newer_data:
	            break
	        results = q(order=order, after=after, limit=LIMIT)
	        page_most_recent, page_oldest = record_results(cache, results, phab)
	        # A page without recordable entries reports None; keep the values of
	        # the previous page in that case so the loop condition stays valid.
	        if page_most_recent is not None:
	            most_recent_info = page_most_recent
	        if page_oldest is not None:
	            oldest_info = page_oldest
	        after = results["cursor"]["after"]
	        print("after: {0!r}".format(after))
	        print("most_recent_info: {0}".format(
	            datetime.fromtimestamp(most_recent_info)))
	        cache.write_cache_to_disk()
	    cache.most_recent_info = most_recent_info_overall
	    if after is None:
	        # We did fetch all records. Mark the cache to contain all info since
	        # the start of time.
	        oldest_info = 0
	    cache.oldest_info = oldest_info
	    cache.write_cache_to_disk()


	def record_reviews(cache, reviews, phab):
	    """Store one page of review search results in cache.

	    Entries whose "type" is not "DREV" are skipped. A review is refreshed,
	    including a differential.querydiffs() call for its diffs, only when the
	    cached object has no dateModified yet or the fetched one is newer.

	    Returns (most_recent_info, oldest_info): the largest and smallest
	    dateModified among the processed entries, or (None, None) when no entry
	    was processed.
	    """
	    newest_seen = None
	    oldest_seen = None
	    for entry in reviews["data"]:
	        if entry["type"] != "DREV":
	            continue
	        review_id = entry["id"]
	        fields = entry["fields"]
	        modified = int(fields["dateModified"])
	        created = int(fields["dateCreated"])
	        title = fields["title"]
	        author = fields["authorPHID"]
	        review = cache.get(review_id)
	        is_stale = ("dateModified" not in review.__dict__
	                    or modified > review.dateModified)
	        if is_stale:
	            diff_results = phab.differential.querydiffs(
	                revisionIDs=[review_id])
	            diffs = []
	            for diff_id in sorted(diff_results.keys()):
	                diff = PhabDiff(diff_id)
	                diff.update(diff_results[diff_id])
	                diffs.append(diff)
	            review.update(title, created, modified, author)
	            review.setPhabDiffs(diffs)
	            print("Updated D{0} modified on {1} ({2} diffs)".format(
	                review_id, datetime.fromtimestamp(modified), len(diffs)))

	        if newest_seen is None or newest_seen < modified:
	            newest_seen = modified
	        if oldest_seen is None or oldest_seen > modified:
	            oldest_seen = modified
	    return newest_seen, oldest_seen


	def record_users(cache, users, phab):
	    """Store one page of user search results in cache.

	    Entries whose "type" is not "USER" are skipped; for every other entry
	    the cached object is updated with its phid and realName. phab is not
	    used.

	    Returns (most_recent_info, oldest_info): the largest and smallest
	    dateModified among the processed entries, or (None, None) when no entry
	    was processed.
	    """
	    newest_seen = None
	    oldest_seen = None
	    for entry in users["data"]:
	        if entry["type"] != "USER":
	            continue
	        user_id = entry["id"]
	        user_phid = entry["phid"]
	        fields = entry["fields"]
	        modified = int(fields["dateModified"])
	        real_name = fields["realName"]
	        cache.get(user_id).update(user_phid, real_name)
	        if newest_seen is None or newest_seen < modified:
	            newest_seen = modified
	        if oldest_seen is None or oldest_seen > modified:
	            oldest_seen = modified
	    return newest_seen, oldest_seen


	# One entry per cached object kind. Each entry is unpacked by update_cache()
	# as (cache, phab_query, order, record_results, max_nr_entries_per_fetch,
	# max_nr_days_to_cache) and handed to update_cached_info().
	PHABCACHESINFO = ((reviews_cache, ("differential", "revision", "search"),
	"updated", record_reviews, 5, 7),
	(users_cache, ("user", "search"), "newest", record_users,
	100, 1000))


	def load_cache():
	    """Populate every cache listed in PHABCACHESINFO from disk and print
	    how many entries it holds and how recent its information is."""
	    for cache, _, _, _, _, _ in PHABCACHESINFO:
	        cache.populate_cache_from_disk()
	        loaded_msg = "Loaded {0} nr entries: {1}".format(
	            cache.get_name(), len(cache.get_ids_in_cache()))
	        print(loaded_msg)
	        if cache.most_recent_info is None:
	            most_recent = None
	        else:
	            most_recent = datetime.fromtimestamp(cache.most_recent_info)
	        print("Loaded {0} has most recent info: {1}".format(
	            cache.get_name(), most_recent))


	def update_cache(phab):
	    """Load the caches from disk, then refresh each one listed in
	    PHABCACHESINFO through update_cached_info() and write it back."""
	    load_cache()
	    for cache_info in PHABCACHESINFO:
	        # cache_info is laid out in update_cached_info()'s argument order.
	        cache = cache_info[0]
	        update_cached_info(phab, *cache_info)
	        nr_objects = len(cache.get_ids_in_cache())
	        print("{0} objects in {1}".format(nr_objects, cache.get_name()))
	        cache.write_cache_to_disk()


	def get_most_recent_reviews(days):
	    """Return the cached reviews modified within days days of the most
	    recently modified one, newest first."""
	    by_recency = sorted(
	        reviews_cache.get_objects(),
	        key=lambda review: review.dateModified,
	        reverse=True)
	    if not by_recency:
	        return by_recency
	    newest_time = datetime.fromtimestamp(by_recency[0].dateModified)
	    cut_off_date = newest_time - timedelta(days=days)
	    recent = []
	    for review in by_recency:
	        # The list is sorted, so everything from here on is too old.
	        if datetime.fromtimestamp(review.dateModified) < cut_off_date:
	            break
	        recent.append(review)
	    return recent


	# All of the above code is about fetching data from Phabricator and caching it
	# on local disk. The below code contains the actual "business logic" for this
	# script.

	# Lazily built map from user phid to real name; filled on the first call of
	# get_real_name_from_author().
	_userphid2realname = None


	def get_real_name_from_author(user_phid):
	    """Return the real name cached for user_phid, or "unknown".

	    The phid-to-name map is built from users_cache on the first call and
	    reused afterwards.
	    """
	    global _userphid2realname
	    if _userphid2realname is None:
	        _userphid2realname = {
	            user.phid: user.realName
	            for user in users_cache.get_objects()}
	    return _userphid2realname.get(user_phid, "unknown")


	def print_most_recent_reviews(phab, days, filter_reviewers):
	    """Print and return a report matching recent reviews to reviewers.

	    Arguments:
	      phab: not used by this function.
	      days: forwarded to get_most_recent_reviews() to select the reviews.
	      filter_reviewers: callable taking the list of (reviewer, scores)
	        tuples found for a review and returning the ones to report.

	    Every message is printed as it is produced. Returns all messages joined
	    with newlines: first one section per review that has at least one
	    reported reviewer, then a summary per reviewer.
	    """
	    msgs = []

	    def add_msg(msg):
	        msgs.append(msg)
	        print(msg)

	    def format_scores(scores):
	        return "/".join(["{0:.1f}%".format(s) for s in scores])

	    newest_reviews = get_most_recent_reviews(days)
	    add_msg(u"These are the reviews that look interesting to be reviewed. " +
	            u"The report below has 2 sections. The first " +
	            u"section is organized per review; the second section is organized "
	            + u"per potential reviewer.\n")
	    oldest_review = newest_reviews[-1] if len(newest_reviews) > 0 else None
	    oldest_datetime = \
	        datetime.fromtimestamp(oldest_review.dateModified) \
	        if oldest_review else None
	    add_msg((u"The report below is based on analyzing the reviews that got " +
	             u"touched in the past {0} days (since {1}). " +
	             u"The script found {2} such reviews.\n").format(
	                 days, oldest_datetime, len(newest_reviews)))
	    reviewer2reviews_and_scores = {}
	    for i, review in enumerate(newest_reviews):
	        # The lookup can yield None instead of a list (e.g. for a review
	        # without diffs); hand the filter an empty list in that case rather
	        # than letting it fail while iterating None.
	        matched_reviewers = filter_reviewers(
	            find_reviewers_for_review(review) or [])
	        if len(matched_reviewers) == 0:
	            continue
	        add_msg((u"{0:>3}. https://reviews.llvm.org/D{1} by {2}\n {3}\n" +
	                 u" Last updated on {4}").format(
	                     i, review.id,
	                     get_real_name_from_author(review.author), review.title,
	                     datetime.fromtimestamp(review.dateModified)))
	        for reviewer, scores in matched_reviewers:
	            add_msg(u" potential reviewer {0}, score {1}".format(
	                reviewer, "(" + format_scores(scores) + ")"))
	            reviewer2reviews_and_scores.setdefault(reviewer, []).append(
	                (review, scores))

	    # Print out a summary per reviewer.
	    for reviewer in sorted(reviewer2reviews_and_scores.keys()):
	        reviews_and_scores = reviewer2reviews_and_scores[reviewer]
	        # Best matching reviews first.
	        reviews_and_scores.sort(key=lambda rs: rs[1], reverse=True)
	        add_msg(u"\n\nSUMMARY FOR {0} (found {1} reviews):".format(
	            reviewer, len(reviews_and_scores)))
	        for review, scores in reviews_and_scores:
	            add_msg(u"[{0}] https://reviews.llvm.org/D{1} '{2}' by {3}".format(
	                format_scores(scores), review.id,
	                review.title, get_real_name_from_author(review.author)))
	    return "\n".join(msgs)


	def get_git_cmd_output(cmd):
	    """Run cmd through the shell and return what it printed.

	    stderr is merged into stdout and the result is decoded as UTF-8,
	    ignoring undecodable bytes. Returns None when the command exits with a
	    non-zero status; the error is logged at debug level.
	    """
	    try:
	        logging.debug(cmd)
	        raw_output = subprocess.check_output(
	            cmd, shell=True, stderr=subprocess.STDOUT)
	    except subprocess.CalledProcessError as error:
	        logging.debug(str(error))
	        return None
	    return raw_output.decode("utf-8", errors='ignore')


	# Matches an "author-mail <address>" line and captures the address. The
	# quantifiers matter: without them only a one-character address followed by
	# exactly one more character would match.
	reAuthorMail = re.compile(r"^author-mail <([^>]*)>.*$")


	def parse_blame_output_line_porcelain(blame_output):
	    """Count the "author-mail" lines in blame_output per email address.

	    blame_output is the text of a "git blame --line-porcelain" run, or
	    None when the command failed. Returns a dict mapping each email
	    address to the number of lines carrying it; the dict is empty for
	    None or for text without such lines.
	    """
	    email2nr_occurences = {}
	    if blame_output is None:
	        return email2nr_occurences
	    for line in blame_output.split('\n'):
	        m = reAuthorMail.match(line)
	        if m:
	            author_email_address = m.group(1)
	            email2nr_occurences[author_email_address] = \
	                email2nr_occurences.get(author_email_address, 0) + 1
	    return email2nr_occurences


	def find_reviewers_for_diff_heuristic(diff):
	    """Score potential reviewers for diff based on git blame output.

	    diff provides dateModified and changes; every change provides oldPath
	    and hunks, and every hunk provides actual_lines_changed_offset, an
	    iterable of (start_line, end_line) pairs.

	    Returns a list of (email, (lines_score, files_score)) tuples, best
	    scores first. lines_score is the percentage of blamed lines in the
	    changed ranges attributed to that email; files_score is the percentage
	    of changed files in which that email shows up in the blame output. The
	    list is empty when the base revision cannot be determined.
	    """
	    # Local import: quote() moved from pipes to shlex in Python 3.3.
	    try:
	        from shlex import quote as shell_quote
	    except ImportError:
	        from pipes import quote as shell_quote

	    # Heuristic 1: assume good reviewers are the ones that touched the same
	    # lines before as this patch is touching.
	    # Heuristic 2: assume good reviewers are the ones that touched the same
	    # files before as this patch is touching.
	    reviewers2nr_lines_touched = {}
	    reviewers2nr_files_touched = {}
	    # Assume last revision before diff was modified is the revision the diff
	    # applies to.
	    git_repo = "git_repos/llvm"
	    # %S is the seconds field; the previous %s is a platform-specific
	    # extension that expands to seconds since the epoch where supported.
	    cmd = 'git -C {0} rev-list -n 1 --before="{1}" master'.format(
	        git_repo,
	        datetime.fromtimestamp(
	            diff.dateModified).strftime("%Y-%m-%d %H:%M:%S"))
	    rev_list_output = get_git_cmd_output(cmd)
	    if rev_list_output is None:
	        # git failed; without a base revision there is nothing to blame.
	        logging.debug("Could not determine base revision")
	        return []
	    base_revision = rev_list_output.strip()
	    logging.debug("Base revision={0}".format(base_revision))
	    for change in diff.changes:
	        # The path comes from the submitted diff and ends up in a shell
	        # command, so quote it. Plain paths are left unchanged by quote().
	        path = shell_quote("{0}".format(change.oldPath))
	        # Compute heuristic 1: look at context of patch lines.
	        for hunk in change.hunks:
	            for start_line, end_line in hunk.actual_lines_changed_offset:
	                # Collect git blame results for authors in those ranges.
	                cmd = ("git -C {0} blame --encoding=utf-8 --date iso -f -e " +
	                       "-w --line-porcelain -L {1},{2} {3} -- {4}").format(
	                           git_repo, start_line, end_line, base_revision,
	                           path)
	                blame_output = get_git_cmd_output(cmd)
	                for reviewer, nr_occurences in \
	                        parse_blame_output_line_porcelain(
	                            blame_output).items():
	                    reviewers2nr_lines_touched[reviewer] = \
	                        reviewers2nr_lines_touched.get(reviewer, 0) + \
	                        nr_occurences
	        # Compute heuristic 2: don't look at context, just at files touched.
	        # Collect git blame results for authors of the whole file.
	        cmd = ("git -C {0} blame --encoding=utf-8 --date iso -f -e -w " +
	               "--line-porcelain {1} -- {2}").format(git_repo, base_revision,
	                                                     path)
	        blame_output = get_git_cmd_output(cmd)
	        # Every author counts once per file, however many lines they own.
	        for reviewer in parse_blame_output_line_porcelain(blame_output):
	            reviewers2nr_files_touched[reviewer] = \
	                reviewers2nr_files_touched.get(reviewer, 0) + 1

	    # Compute "match scores"
	    total_nr_lines = sum(reviewers2nr_lines_touched.values())
	    total_nr_files = len(diff.changes)
	    reviewers_matchscores = []
	    for reviewer, nr_files in reviewers2nr_files_touched.items():
	        lines_score = \
	            reviewers2nr_lines_touched.get(reviewer, 0)*100.0/total_nr_lines \
	            if total_nr_lines != 0 else 0
	        files_score = \
	            nr_files*100.0/total_nr_files if total_nr_files != 0 else 0
	        reviewers_matchscores.append((reviewer, (lines_score, files_score)))
	    reviewers_matchscores.sort(key=lambda i: i[1], reverse=True)
	    return reviewers_matchscores


	def find_reviewers_for_review(review):
	    """Return the potential reviewers for the newest diff of review.

	    The result is the list produced by find_reviewers_for_diff_heuristic()
	    for the diff in review.phabDiffs with the largest dateModified. A
	    review without diffs yields an empty list, so callers can always
	    iterate over the result.
	    """
	    diffs = list(review.phabDiffs)
	    if not diffs:
	        return []
	    # Only the newest diff is looked at.
	    newest_diff = max(diffs, key=lambda d: d.dateModified)
	    matched_reviewers = find_reviewers_for_diff_heuristic(newest_diff)
	    # Show progress, as this is a slow operation:
	    sys.stdout.write('.')
	    sys.stdout.flush()
	    logging.debug(u"matched_reviewers: {0}".format(matched_reviewers))
	    return matched_reviewers


	def update_git_repos():
	    """Clone every repository in GIT_REPO_METADATA that is not yet present
	    under "git_repos", then run "git pull --rebase" in each of them."""
	    repos_root = "git_repos"
	    for name, url in GIT_REPO_METADATA:
	        checkout = os.path.join(repos_root, name)
	        if not os.path.exists(checkout):
	            get_git_cmd_output("git clone {0} {1}".format(url, checkout))
	        get_git_cmd_output("git -C {0} pull --rebase".format(checkout))


	def send_emails(email_addresses, sender, msg):
	    """Mail msg to every address in email_addresses.

	    Arguments:
	      email_addresses: iterable of recipient addresses; when it is empty
	        nothing is sent and no SMTP connection is opened.
	      sender: address used for the 'From' header and the envelope.
	      msg: text of the report; it is UTF-8 encoded before being attached.

	    One message per recipient is sent through an smtplib.SMTP() connection
	    opened with connect() and no arguments. The connection is closed even
	    when sending fails.
	    """
	    if not email_addresses:
	        return
	    s = smtplib.SMTP()
	    s.connect()
	    try:
	        for email_address in email_addresses:
	            email_msg = email.mime.multipart.MIMEMultipart()
	            email_msg['From'] = sender
	            email_msg['To'] = email_address
	            email_msg['Subject'] = 'LLVM patches you may be able to review.'
	            # NOTE(review): this passes encoded bytes to MIMEText without
	            # naming a charset - confirm this is what is wanted on Python 3.
	            email_msg.attach(
	                email.mime.text.MIMEText(msg.encode('utf-8'), 'plain'))
	            # python 3.x: s.send_message(email_msg)
	            s.sendmail(email_msg['From'], email_msg['To'],
	                       email_msg.as_string())
	        s.quit()
	    finally:
	        # Releases the socket when sending raised; harmless after quit().
	        s.close()


	def filter_reviewers_to_report_for(people_to_look_for):
	    """Return a filter keeping the (reviewer, scores) tuples whose reviewer
	    is in people_to_look_for.

	    This is just an example filter: it only reports potential reviews for
	    the people that will receive the report email.
	    """
	    def keep_wanted_reviewers(potential_reviewers):
	        kept = []
	        for candidate in potential_reviewers:
	            if candidate[0] in people_to_look_for:
	                kept.append(candidate)
	        return kept

	    return keep_wanted_reviewers


	def main():
	    """Parse the command line, refresh the caches and git checkouts, print
	    the report and optionally mail it.

	    The options that take lists default to an empty list and --verbose to
	    0, so the script also runs when they are left out.
	    """
	    parser = argparse.ArgumentParser(
	        description='Match open reviews to potential reviewers.')
	    parser.add_argument(
	        '--no-update-cache',
	        dest='update_cache',
	        action='store_false',
	        default=True,
	        help='Do not update cached Phabricator objects')
	    parser.add_argument(
	        '--email-report',
	        dest='email_report',
	        nargs='*',
	        default=[],
	        help="A email addresses to send the report to.")
	    parser.add_argument(
	        '--sender',
	        dest='sender',
	        default="",
	        help="The email address to use in 'From' on messages emailed out.")
	    parser.add_argument(
	        '--email-addresses',
	        dest='email_addresses',
	        nargs='*',
	        default=[],
	        help="The email addresses (as known by LLVM git) of " +
	        "the people to look for reviews for.")
	    # Without a default the count is None when -v is absent, and comparing
	    # None with an int fails on Python 3.
	    parser.add_argument('--verbose', '-v', action='count', default=0)

	    args = parser.parse_args()

	    if args.verbose >= 1:
	        logging.basicConfig(level=logging.DEBUG)

	    # Command line arguments are bytes on Python 2 and already text on
	    # Python 3; only the former need decoding.
	    people_to_look_for = [
	        e.decode('utf-8') if isinstance(e, bytes) else e
	        for e in args.email_addresses]
	    logging.debug("Will look for reviews that following contributors could " +
	                  "review: {}".format(people_to_look_for))
	    logging.debug("Will email a report to: {}".format(args.email_report))

	    phab = init_phab_connection()

	    if args.update_cache:
	        update_cache(phab)

	    load_cache()
	    update_git_repos()
	    msg = print_most_recent_reviews(
	        phab,
	        days=1,
	        filter_reviewers=filter_reviewers_to_report_for(people_to_look_for))

	    # Only mail when at least one recipient was given.
	    if args.email_report:
	        send_emails(args.email_report, args.sender, msg)


	# Run main() only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	main()