# csa / scopus
#
# Get new publications
# Publication and citations retrieval
# A. Kopmann, 6.2.17 (ak)
#
# Scope:
# Publications are added once to WordPress as a post or comment.
# Afterwards scopus will not change or modify anything anymore!
# Updates are completely the responsibility of the ufo users.
#

# Todo:
# - add mail to author button
# - save full scopus data in the database
#

# Configuration - Scopus

import datetime
import requests
import json
import os.path

from my_scopus import MY_API_KEY
from my_scopus import ak, csa, pdv, ufo, ufo_pdv, ufo_ips, ufo_eps, ufo_apps
from ak_scopus import get_scopus_list, get_scopus_data, get_scopus_refs


from ak_wordpress import wordpress_post_by_scopus, wordpress_comment_by_scopus, wordpress_get_post


# MySQL persistent data (Account: scopus, $scopus$)
import pymysql.cursors
import pymysql

# Connection settings passed to pymysql.connect() by every update_* function.
# NOTE(review): the password is stored in plain text in the source; consider
# reading it from the environment or an untracked config file.
db_host = 'localhost'
db_user = 'scopus'
db_pw = '$scopus$'
db_name = 'scopus'

# One tab-separated summary line per run is appended to this file at the
# end of the script (the file is created with a header if it is missing).
log_file = '/Users/kopmann/scopus-publications.log'


# Summary
# Run counters; the update_* functions assign them via `global` and the
# script prints and logs them at the end.
npubs = 0       # publications with wpid > 0 (set in update_citations)
nnewpubs= 0     # publications with wpid IS NULL (set in update_wp_posts)
ncites = 0      # total rows in `citations` (set in update_wp_comments)
nnewcites = 0   # citations with wpcommentid IS NULL (set in update_wp_comments)


# Read publications of a list of authors and store in the database
def update_publications(authids,authname='',scopus_opts = '',max=0):
    """Fetch the publications of an author group and register them.

    For every record returned by get_scopus_list() a row is created in the
    `publications` table (if not yet present), its `citedbycount` is
    refreshed and, when `authname` is given, `authname` is added to the
    JSON list stored in the `categories` column.

    Parameters:
        authids     -- author ids, passed through to get_scopus_list()
        authname    -- name of the author group; used as category name.
                       An empty string skips the category update.
        scopus_opts -- additional query options for get_scopus_list()
        max         -- passed through to get_scopus_list()
                       (presumably a limit on the number of results --
                       TODO confirm in ak_scopus)
    """

    print("=== Update of publications for the author group: " + authname)

    # Connect to the database
    connection = pymysql.connect(host=db_host, user=db_user, password=db_pw,
                                 db=db_name, charset='utf8mb4',
                                 cursorclass=pymysql.cursors.DictCursor)

    try:
        # Request all publications of a list of authors (in one query)
        # Result: list of records with (scopus id, eid, citedbycount)
        # The query runs inside the try block so that the connection is
        # closed even if the request to scopus fails.
        publist = get_scopus_list(authids, scopus_opts, max)
        print("Total number of publications: %d" % len(publist))

        with connection.cursor() as cursor:
            for pub in publist:
                scopusid = pub[0]
                eid = pub[1]
                citedbycount = pub[2]

                # 1 / Create a new record; existing ones are left untouched
                # Todo: strip the prefix SCOPUS_ID?!
                sql = "INSERT IGNORE INTO `publications` (`scopusid`,`eid`) VALUES (%s,%s)"
                cursor.execute(sql, (scopusid, eid))

                sql = "UPDATE `publications` SET `citedbycount` = %s WHERE `scopusid` = %s"
                cursor.execute(sql, (citedbycount, scopusid))

                # 2 / Add categories
                if not authname:
                    continue

                sql = "SELECT categories FROM publications WHERE scopusid = %s"
                cursor.execute(sql, (scopusid,))
                rows = cursor.fetchall()

                # Start from a clean state for every publication, so a
                # missing row can never pick up the previous one's value.
                stored = rows[0]['categories'] if rows else None

                try:
                    catlist = json.loads(stored)
                except (TypeError, ValueError):
                    # TypeError: column is NULL; ValueError: not valid JSON
                    print("No categories upto now")
                    catlist = []

                # Anything but a JSON list cannot be extended; start over
                if not isinstance(catlist, list):
                    catlist = []

                if authname not in catlist:
                    catlist.append(authname)

                sql = "UPDATE `publications` SET `categories` = %s WHERE `scopusid` = %s"
                cursor.execute(sql, (json.dumps(catlist), scopusid))

        # connection is not autocommit by default. So you must commit to save
        # your changes.
        connection.commit()

    finally:
        connection.close()


# Read all citations and store in the citation table
def update_citations():
    """Load new citations from scopus for all posted publications.

    Reads every publication that already has a wordpress post (wpid > 0).
    If its `citedbycount` exceeds the number of citations loaded so far
    (`citesloaded`), the citing articles are requested with
    get_scopus_refs() and stored in the `citations` table; afterwards
    `citesloaded` is raised to the number of records received.

    Side effect: sets the global counter `npubs` to the number of
    publications read.
    """
    global npubs

    print("")
    print("=== Update citatation of all publication in the database")

    # Connect to the database
    connection = pymysql.connect(host=db_host, user=db_user, password=db_pw,
                                 db=db_name, charset='utf8mb4',
                                 cursorclass=pymysql.cursors.DictCursor)

    # Todo: Shift to a separate script !?
    try:
        with connection.cursor() as cursor:
            sql = "SELECT wpid,eid,citedbycount,citesloaded FROM publications WHERE wpid > 0"
            cursor.execute(sql)
            publications = cursor.fetchall()

            npubs = len(publications)
            print("Total number of publications is %d" % npubs)

            for pub in publications:
                wpid = int(pub['wpid'])
                # NULL counters are treated as zero
                citedbycount = int(pub['citedbycount'] or 0)
                citesloaded = int(pub['citesloaded'] or 0)

                # Nothing to do without an eid or without new citations
                if not pub['eid'] or citedbycount <= citesloaded:
                    continue

                print("Processing %d = %s previously cited by %d" % (wpid, pub['eid'], citesloaded))

                citations = get_scopus_refs(pub['eid'])
                n = len(citations)
                if n == 0:
                    continue

                # Use a separate loop variable; the outer `pub` stays intact
                for cite in citations:
                    try:
                        citestr = json.dumps(cite)
                    except (TypeError, ValueError):
                        print("Error serializing pub entry")
                        # Store NULL instead of reusing the data of the
                        # previous citation (or failing on an unset name)
                        citestr = None

                    # Save the complete scopus data of the citing article;
                    # already known citations are skipped by INSERT IGNORE
                    sql = "INSERT IGNORE INTO `citations` (`wpid`,`scopusid`,`eid`,`scopusdata`) VALUES (%s,%s,%s,%s)"
                    cursor.execute(sql, (wpid, cite['dc:identifier'], cite['eid'], citestr))

                # One commit per publication instead of one per row
                connection.commit()

                # Update the number of cites for this article
                if n > citesloaded:
                    print("New citations found %d -> %d" % (citesloaded, n))
                    sql = "UPDATE `publications` SET `citesloaded` = %s WHERE `wpid` = %s"
                    cursor.execute(sql, (n, wpid))
                    connection.commit()

    finally:
        connection.close()


# Create wordpress posts for all entries that have none
def update_wp_posts():
    """Create a wordpress post for every publication without one.

    Selects all rows of `publications` whose `wpid` is NULL, requests the
    article data with get_scopus_data(), creates the post with
    wordpress_post_by_scopus() and stores the returned post id in `wpid`.

    Side effect: sets the global counter `nnewpubs` to the number of
    publications found without a post.
    """
    global nnewpubs

    print("")
    print("=== Create posts for newly registered publication in scopus")

    # Connect to the database
    connection = pymysql.connect(host=db_host, user=db_user, password=db_pw,
                                 db=db_name, charset='utf8mb4',
                                 cursorclass=pymysql.cursors.DictCursor)

    # Todo: Shift to a separate script !?
    try:
        with connection.cursor() as cursor:
            sql = "SELECT scopusid,categories FROM publications WHERE wpid IS NULL"
            cursor.execute(sql)
            pending = cursor.fetchall()

        nnewpubs = len(pending)
        if nnewpubs > 0:
            print("Number of new publications is %d" % nnewpubs)
        else:
            print("Nothing new found")

        # Retrieve all information required for the wordpress page
        for pub in pending:
            # `categories` is NULL for publications registered without an
            # author group; %-formatting copes with None, concatenation
            # does not.
            print("Processing %s categories %s" % (pub['scopusid'], pub['categories']))

            data = get_scopus_data(pub['scopusid'])

            # Parse categories
            try:
                catlist = json.loads(pub['categories'])
            except (TypeError, ValueError):
                # TypeError: column is NULL; ValueError: not valid JSON
                print("No categories specified")
                catlist = []

            wpid = wordpress_post_by_scopus(data, catlist)

            # Todo: save the full scopus data (json.dumps(data)) as well

            # Update publication database; the values are passed as
            # parameters so that the driver takes care of the quoting
            with connection.cursor() as cursor:
                sql = "UPDATE publications SET wpid = %s WHERE scopusid = %s"
                cursor.execute(sql, (wpid, pub['scopusid']))
                connection.commit()

    finally:
        connection.close()


def update_wp_comments():
    """Post a wordpress comment for every citation that has none yet.

    Selects all rows of `citations` whose `wpcommentid` is NULL, creates
    the comment with wordpress_comment_by_scopus() and stores the returned
    comment id in `wpcommentid`.

    Side effects: sets the global counters `ncites` (total number of rows
    in `citations`) and `nnewcites` (citations without a comment).
    """
    global ncites
    global nnewcites

    print("")
    print("=== Create comments for newly registered citations in scopus")

    # Connect to the database
    connection = pymysql.connect(host=db_host, user=db_user, password=db_pw,
                                 db=db_name, charset='utf8mb4',
                                 cursorclass=pymysql.cursors.DictCursor)

    # Todo: Shift to a separate script !?
    try:
        with connection.cursor() as cursor:
            # Count all citations
            sql = "SELECT COUNT(id) FROM citations"
            cursor.execute(sql)
            counted = cursor.fetchall()
            if len(counted) > 0:
                ncites = counted[0]['COUNT(id)']

            # Read all citations without a wordpress comment
            sql = "SELECT id,wpid,scopusdata FROM citations WHERE wpcommentid IS NULL"
            cursor.execute(sql)
            pending = cursor.fetchall()
            nnewcites = len(pending)
            print("Number of new citations is %d" % nnewcites)

            for cite in pending:
                wpid = int(cite['wpid'])
                print("Processing post " + str(wpid))

                try:
                    data = json.loads(cite['scopusdata'])
                except (TypeError, ValueError):
                    # TypeError: column is NULL; ValueError: not valid JSON
                    print("Scopus data missing?!")
                    data = []

                # If the creation of the comment fails, the wpcommentid 0 is
                # written to the database. This means, there is no second try
                # to get this citations added.
                # All failed comments can be found by searching for wpcommentid = 0
                #
                # Only ordinary errors are caught: an interrupt (Ctrl-C) or
                # exit request must abort the run instead of marking the
                # citation as permanently failed.
                wpcommentid = 0
                try:
                    wpcommentid = wordpress_comment_by_scopus(wpid, data)
                except Exception as exc:
                    print("Error: Submission of comment failed: %s" % exc)

                sql = "UPDATE citations SET wpcommentid = %s WHERE id = %s"
                cursor.execute(sql, (wpcommentid, cite['id']))
                connection.commit()

    finally:
        connection.close()


# Todo: Add a script to save the data for all publications in the database!!!
# There was some problem before?!
#

# Todo: Add scripts to check consistency in the database
# and fix problems if detected
# E.g. search for wpcommentid == 0
# Check if wp posts + comments are still available, display
# deleted entries
#
#


# Main

start = datetime.datetime.now()

print("")
print("***********************************************")
print("**** scopus-get-publications / " + start.strftime("%Y-%m-%d") + " *****")
print("***********************************************")
print("")


# Update publication database; search for new publications.
# One call per author group; the groups (ufo_pdv, ufo_ips, ...) are
# imported from my_scopus at the top of this file.

# Todo: Detect, if there is no access to scopus !!!
#

# For a quick test the number of results can be limited, e.g.
#update_publications(ufo_pdv, "Computing", '(PUBYEAR AFT 2006)',10)
update_publications(ufo_pdv, "Computing", '(PUBYEAR AFT 2006)')
update_publications(ufo_ips, "X-ray Imaging", '(PUBYEAR AFT 2010)')
update_publications(ufo_eps, "Electronics", '(PUBYEAR AFT 2010)')
update_publications(ufo_apps, "Morphology", '(PUBYEAR AFT 2010)')


# Create wordpress posts for the publications registered above
update_wp_posts()

# Read all citations
# Todo: read only new citations?!
update_citations()


# Loop over all cites and post comments to wordpress, when necessary;
# update database
update_wp_comments()
# Todo: deactivate comments for scopus posts!!!


# Display summary
end = datetime.datetime.now()
print("")
print("Summary: (see also logfile %s) " % log_file)
print("Date       = " + str(start))
print("NPubs      = " + str(npubs))
print("NNewPubs   = " + str(nnewpubs))
print("NCites     = " + str(ncites))
print("NNewCites  = " + str(nnewcites))
print("Runtime    = " + str(end - start))


# Write summary to log file
# On first use the file is created with a header. `with` closes the file
# even if a write fails.
if not os.path.isfile(log_file):
    print("Create logfile " + log_file)
    with open(log_file, "w") as log:
        log.write(__file__ + "\n")
        log.write("\n")
        log.write("      Date\t    Time\tNPubs\tNNewP\tNCite\tNNewC\t          TRun\n")
        log.write("------------------------------------------------------------------------------\n")


# Append one line per run
with open(log_file, "a") as log:
    log.write("%s\t%s\t%5d\t%5d\t%5d\t%5d\t%s\n" % (start.strftime("%Y-%m-%d"),
                                        start.strftime("%H:%M:%S"),
                                        npubs, nnewpubs, ncites, nnewcites,
                                        str(end - start)))


# done