123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410 |
- """ Publication and citations retrieval
-
- *A. Kopmann, 6.2.17 (ak)*
- Scope:
- Publications are added to WordPress once, as a post or comment.
- Afterwards scopus will not change or modify anything any more.
- Any later update is entirely the responsibility of the ufo users.
- The operation of the script splits in four phases:
- - Read all publications for one or more author groups
- The groups are all defined in the configuration file
- The publications are stored in a local cache database
- - For every new publication a post in WordPress is created.
- The post is added to the categories according to the matching
- author groups
- - For each publication the citations are requested and stored
- in the local cache database as well
- - For each new citation a Wordpress comment is created.
- Todo:
- - add mail to author button
- - save full scopus data in the database
- - Add a script to save the data for all publications in the database!!!
- There was some problem before?!
- - Add scripts to check the consistency of the database
- and fix problems if detected
- E.g. search for wpcommentid == 0
- Check if wp posts + comments are still available, display
- deleted entries
- """
- # Configuration - Scopus
- import datetime
- import requests
- import json
- import os.path
- from ak_scopus import get_scopus_list, get_scopus_data, get_scopus_refs
- from ak_wordpress import wordpress_post_by_scopus, wordpress_comment_by_scopus, wordpress_get_post
- # MySQL persistent data (Account: scopus, $scopus$)
- import pymysql.cursors
- import pymysql
- from config import *
# Run summary counters. They are filled in by the update_* functions
# (via ``global``) and reported at the end of the script run.
npubs = nnewpubs = ncites = nnewcites = 0
def update_publications(authids, authname='', scopus_opts='', max=0):
    """Read the publications of a list of authors and store them in the database.

    Every record returned by ``get_scopus_list`` is registered in the
    ``publications`` table (``INSERT IGNORE``, so existing rows are kept)
    and its ``citedbycount`` is refreshed.  When ``authname`` is non-empty,
    it is appended to the JSON list stored in the ``categories`` column of
    each publication unless it is already listed there.

    Args:
        authids: Author ids, passed unchanged to ``get_scopus_list``.
        authname: Name of the author group; recorded as a category of each
            publication.  An empty value skips the category handling.
        scopus_opts: Extra search options, passed unchanged to
            ``get_scopus_list``.
        max: Passed unchanged to ``get_scopus_list`` (presumably a limit on
            the number of results -- confirm in ak_scopus).  The name
            shadows the builtin but is kept for caller compatibility.
    """
    print("=== Update of publications for the author group: " + authname)

    # Query Scopus *before* opening the database connection, so a failing
    # request cannot leave an open connection behind.
    # Each record is indexed as [0] scopus id, [1] eid, [2] citedbycount.
    publist = get_scopus_list(authids, scopus_opts, max)
    print("Total number of publications: %d" % len(publist))

    connection = pymysql.connect(host=db_host, user=db_user, password=db_pw,
                                 db=db_name, charset='utf8mb4',
                                 cursorclass=pymysql.cursors.DictCursor)
    try:
        with connection.cursor() as cursor:
            for pub in publist:
                scopusid, eid, citedbycount = pub[0], pub[1], pub[2]

                # 1 / Create the record if it is new, then refresh the count
                cursor.execute(
                    "INSERT IGNORE INTO `publications` (`scopusid`,`eid`) VALUES (%s,%s)",
                    (scopusid, eid))
                cursor.execute(
                    "UPDATE `publications` SET `citedbycount` = %s WHERE `scopusid` = %s",
                    (citedbycount, scopusid))

                # 2 / Add the author group to the categories
                if not authname:
                    continue

                cursor.execute(
                    "SELECT categories FROM publications WHERE scopusid = %s",
                    (scopusid,))
                rows = cursor.fetchall()

                catlist = []
                if rows:
                    try:
                        catlist = json.loads(rows[0]['categories'])
                    except TypeError:
                        # Column is NULL: nothing stored so far
                        print("No categories upto now")
                    except ValueError:
                        # Stored text is not valid JSON: start over instead
                        # of aborting the whole update run
                        print("Invalid categories for %s, resetting" % scopusid)

                if authname not in catlist:
                    catlist.append(authname)
                    cursor.execute(
                        "UPDATE `publications` SET `categories` = %s WHERE `scopusid` = %s",
                        (json.dumps(catlist), scopusid))

        # The connection is not autocommit by default, so the changes have
        # to be committed explicitly.
        connection.commit()
    finally:
        connection.close()
def update_citations():
    """Read the citations of all posted publications and store them.

    Selects every row of ``publications`` with ``wpid > 0`` and, for each
    one whose ``citedbycount`` exceeds ``citesloaded``, fetches the citing
    documents with ``get_scopus_refs`` and registers them in the
    ``citations`` table (``INSERT IGNORE``).  Afterwards ``citesloaded`` is
    raised to the number of citations received.

    Side effects:
        Sets the module-level counter ``npubs`` to the number of selected
        publications.
    """
    global npubs

    print("")
    print("=== Update citatation of all publication in the database")

    connection = pymysql.connect(host=db_host, user=db_user, password=db_pw,
                                 db=db_name, charset='utf8mb4',
                                 cursorclass=pymysql.cursors.DictCursor)

    # Todo: Shift to a separate script !?
    try:
        with connection.cursor() as cursor:
            cursor.execute(
                "SELECT wpid,eid,citedbycount,citesloaded FROM publications WHERE wpid > 0")
            publications = cursor.fetchall()

            print("Total number of publications is %d" % len(publications))
            npubs = len(publications)

            for pub in publications:
                wpid = int(pub['wpid'])
                # NULL counters are treated as zero
                citedbycount = 0 if pub['citedbycount'] is None else int(pub['citedbycount'])
                citesloaded = 0 if pub['citesloaded'] is None else int(pub['citesloaded'])

                # Only query Scopus when it reports more citations than
                # have been loaded so far
                if not pub['eid'] or citedbycount <= citesloaded:
                    continue

                print("Processing %d = %s previously cited by %d" % (wpid, pub['eid'], citesloaded))
                data = get_scopus_refs(pub['eid'])
                n = len(data)

                if n > 0:
                    # A separate loop name keeps the publication row `pub`
                    # from being overwritten by the citation entries.
                    for cite in data:
                        try:
                            citestr = json.dumps(cite)
                        except TypeError:
                            # Store NULL rather than a stale or undefined
                            # string from a previous iteration
                            print("Error serializing pub entry")
                            citestr = None

                        # Save the complete scopus data of the citing article
                        cursor.execute(
                            "INSERT IGNORE INTO `citations` (`wpid`,`scopusid`,`eid`,`scopusdata`) VALUES (%s,%s,%s,%s)",
                            (wpid, cite['dc:identifier'], cite['eid'], citestr))
                    connection.commit()

                # Update the number of cites loaded for this article
                if n > citesloaded:
                    print("New citations found %d -> %d" % (citesloaded, n))
                    cursor.execute(
                        "UPDATE `publications` SET `citesloaded` = %s WHERE wpid = %s",
                        (n, wpid))
                    connection.commit()
    finally:
        connection.close()
def update_wp_posts():
    """Create WordPress posts for all publications that have none yet.

    Selects every row of ``publications`` whose ``wpid`` is NULL, loads the
    publication data with ``get_scopus_data``, creates the post with
    ``wordpress_post_by_scopus`` (passing the categories decoded from the
    ``categories`` column) and stores the returned id in ``wpid``.

    Side effects:
        Sets the module-level counter ``nnewpubs`` to the number of
        selected publications when at least one was found.
    """
    global nnewpubs

    print("")
    print("=== Create posts for newly registered publication in scopus")

    connection = pymysql.connect(host=db_host, user=db_user, password=db_pw,
                                 db=db_name, charset='utf8mb4',
                                 cursorclass=pymysql.cursors.DictCursor)

    # Todo: Shift to a separate script !?
    try:
        with connection.cursor() as cursor:
            cursor.execute(
                "SELECT scopusid,categories FROM publications WHERE wpid IS NULL")
            new_pubs = cursor.fetchall()

            if new_pubs:
                print("Number of new publications is %d" % len(new_pubs))
                nnewpubs = len(new_pubs)
            else:
                print("Nothing new found")

            for pub in new_pubs:
                # %-formatting instead of '+': categories may be NULL, and
                # concatenating None to a string raises TypeError.
                print("Processing %s categories %s" % (pub['scopusid'], pub['categories']))

                # Retrieve all information required for the wordpress page
                data = get_scopus_data(pub['scopusid'])

                # Parse categories
                catlist = []
                try:
                    catlist = json.loads(pub['categories'])
                except TypeError:
                    # Column is NULL
                    print("No categories specified")
                except ValueError:
                    # Stored text is not valid JSON: post without categories
                    print("Invalid categories for %s" % pub['scopusid'])

                wpid = wordpress_post_by_scopus(data, catlist)

                # Todo: also store the full scopus data (json.dumps(data))
                # in the scopusdata column.
                # Values are passed as parameters so the driver does the
                # quoting; the statement is never built from strings.
                cursor.execute(
                    "UPDATE publications SET wpid = %s WHERE scopusid = %s",
                    (wpid, pub['scopusid']))
                connection.commit()
    finally:
        connection.close()
def update_wp_comments():
    """Create a WordPress comment for every newly found citation.

    Selects every row of ``citations`` whose ``wpcommentid`` is NULL,
    decodes its ``scopusdata`` and submits it with
    ``wordpress_comment_by_scopus``.  The returned comment id is written
    back to ``wpcommentid``.

    If the submission raises, ``wpcommentid`` 0 is written instead.  This
    means there is no second attempt for that citation; all failed
    comments can be found by searching for ``wpcommentid = 0``.

    Side effects:
        Sets the module-level counters ``ncites`` (total number of rows in
        ``citations``) and ``nnewcites`` (number of rows processed here).
    """
    global ncites
    global nnewcites

    print("")
    print("=== Create comments for newly registered citations in scopus")

    connection = pymysql.connect(host=db_host, user=db_user, password=db_pw,
                                 db=db_name, charset='utf8mb4',
                                 cursorclass=pymysql.cursors.DictCursor)

    # Todo: Shift to a separate script !?
    try:
        with connection.cursor() as cursor:
            # Count all citations
            cursor.execute("SELECT COUNT(id) FROM citations")
            count_rows = cursor.fetchall()
            if count_rows:
                ncites = count_rows[0]['COUNT(id)']

            # Citations without a comment id have not been posted yet
            cursor.execute(
                "SELECT id,wpid,scopusdata FROM citations WHERE wpcommentid IS NULL")
            new_cites = cursor.fetchall()
            print("Number of new citations is %d" % len(new_cites))
            nnewcites = len(new_cites)

            for cite in new_cites:
                wpid = int(cite['wpid'])
                print("Processing post " + str(wpid))

                data = []
                try:
                    data = json.loads(cite['scopusdata'])
                except TypeError:
                    # Column is NULL
                    print("Scopus data missing?!")
                except ValueError:
                    # Stored text is not valid JSON
                    print("Scopus data invalid for citation %s" % cite['id'])

                wpcommentid = 0
                try:
                    wpcommentid = wordpress_comment_by_scopus(wpid, data)
                except Exception:
                    # Best effort: record the failure as id 0 and go on.
                    # KeyboardInterrupt / SystemExit are deliberately not
                    # caught, so the run can still be aborted.
                    print("Error: Submission of comment failed")

                cursor.execute(
                    "UPDATE citations SET wpcommentid = %s WHERE id = %s",
                    (wpcommentid, cite['id']))
                connection.commit()
    finally:
        connection.close()
- # Main
- # Prevent sphinx from execution
if __name__ == "__main__":
    # Script entry point: run the four update phases in order, print a
    # summary and append one line with the counters to the log file.
    start = datetime.datetime.now()

    print("")
    print("***********************************************")
    print("**** scopus-get-publications / " + start.strftime("%Y-%m-%d") + " *****")
    print("***********************************************")
    print("")

    # Update publication database; search for new publications.
    # Loop over all work groups listed in sc_workgroups.
    # Todo: Detect, if there is no access to scopus !!!
    search_param = '(PUBYEAR AFT %d)' % (sc_start)
    for wp in sc_workgroups:
        update_publications(wp['authors'], wp['name'], search_param)

    # Posts are created after all groups have been processed, so every
    # publication carries its complete category list.
    update_wp_posts()

    # Read all citations
    # Todo: read only new citations?!
    update_citations()

    # Loop over all cites and post comments to wordpress, when necessary;
    # update database
    update_wp_comments()

    # Todo: deactivate comments for scopus posts!!!

    # Display summary
    end = datetime.datetime.now()
    print("")
    print("Summary: (see also logfile %s) " % log_file)
    print("Date = " + str(start))
    print("NPubs = " + str(npubs))
    print("NNewPubs = " + str(nnewpubs))
    print("NCites = " + str(ncites))
    print("NNewCites = " + str(nnewcites))
    print("Runtime = " + str(end - start))

    # Write summary to log file; `with` closes the file even if a write
    # fails.
    if not os.path.isfile(log_file):
        print("Create logfile " + log_file)
        # New file: write the header first
        with open(log_file, "w") as log:
            log.write(__file__ + "\n")
            log.write("\n")
            log.write(" Date\t Time\tNPubs\tNNewP\tNCite\tNNewC\t TRun\n")
            log.write("------------------------------------------------------------------------------\n")

    with open(log_file, "a") as log:
        log.write("%s\t%s\t%5d\t%5d\t%5d\t%5d\t%s\n" % (
            start.strftime("%Y-%m-%d"),
            start.strftime("%H:%M:%S"),
            npubs, nnewpubs, ncites, nnewcites, str(end - start)))
- # done
|