scopus_get_publications.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418
""" Publication and citations retrieval

*A. Kopmann, 6.2.17 (ak)*

Scope:

Publications are added once to WordPress as a post or comment.
Afterwards scopus will not change or modify anything any more.
Update is completely in the responsibility of the ufo users.

The operation of the script splits in four phases:

- Read all publications for one or more author groups.
  The groups are all defined in the configuration file.
  The publications are stored in a local cache database.
- For all new publications a post in WordPress is created.
  The post is added to the categories according to the matching
  author groups.
- For each publication the citations are requested and stored
  in the local cache database as well.
- For each new citation a WordPress comment is created.

Todo:

- add mail to author button
- save full scopus data in the database
- Add a script to save the data for all publications in the database!!!
  There was some problem before?!
- Add scripts to check consistency in the database
  and fix problems if detected.
  E.g. search for wpcommentid == 0.
  Check if WordPress posts + comments are still available, display
  deleted entries.
"""
  28. # Configuration - Scopus
  29. import datetime
  30. import requests
  31. import json
  32. import os.path
  33. from ak_scopus import get_scopus_list, get_scopus_data, get_scopus_refs
  34. from ak_wordpress import wordpress_post_by_scopus, wordpress_comment_by_scopus, wordpress_get_post
  35. # Mysql persistent data (Accout: scopus, $scopus$)
  36. import pymysql.cursors
  37. import pymysql
  38. from config import *
  39. # Summary
  40. npubs = 0
  41. nnewpubs= 0
  42. ncites = 0
  43. nnewcites = 0
  44. def update_publications(authids,authname='',scopus_opts = '',max=0):
  45. """ Read publications of a list of authors and store in the database """
  46. print "=== Update of publications for the author group: " + authname
  47. #print str(authids)
  48. # Connect to the database
  49. connection = pymysql.connect(host=db_host,user=db_user,password=db_pw,db=db_name,charset='utf8mb4',cursorclass=pymysql.cursors.DictCursor)
  50. # Request all publications of a list of authors (in one query)
  51. # Result: list of records with (scopus ids, eid, citedbycount)
  52. # The citation could be used later also by wordpress (may be via a plugin)
  53. publist = get_scopus_list(authids,scopus_opts,max)
  54. #publist = get_scopus_list(authids, scopus_opts, 3)
  55. #publist = get_scopus_list(authids, '(PUBYEAR AFT 2014)')
  56. print "Total number of publications: %d" % len(publist)
  57. #print publist
  58. # Save all publication to the publication database
  59. try:
  60. with connection.cursor() as cursor:
  61. for pub in publist:
  62. # 1 / Create a new records
  63. #print pub # Todo: strip the prefix SCOPUS_ID?!
  64. sql = "INSERT IGNORE INTO `publications` (`scopusid`,`eid`) VALUES (%s,%s)"
  65. cursor.execute(sql, (pub[0],pub[1]))
  66. sql = "UPDATE `publications` SET `citedbycount` = %s WHERE `scopusid` = %s"
  67. cursor.execute(sql, (pub[2],pub[0]))
  68. # 2 / Add categories
  69. if len(authname) > 0:
  70. catlist = []
  71. sql = "SELECT categories FROM publications WHERE scopusid = %s"
  72. cursor.execute(sql, (pub[0]))
  73. result = cursor.fetchall()
  74. if len(result) > 0:
  75. #print "Categories %s" % result[0]['categories']
  76. cat = result[0]['categories']
  77. try:
  78. catlist = json.loads(cat)
  79. except TypeError:
  80. #print("No categories upto now")
  81. pass
  82. if authname not in catlist:
  83. catlist += [authname]
  84. sql = "UPDATE `publications` SET `categories` = %s WHERE `scopusid` = %s"
  85. cursor.execute(sql, (json.dumps(catlist),pub[0]))
  86. # connection is not autocommit by default. So you must commit to save
  87. # your changes.
  88. connection.commit()
  89. finally:
  90. connection.close()
  91. def update_citations():
  92. """ Read all citations and store in the citation table """
  93. print ""
  94. print "=== Update citatation of all publication in the database"
  95. # Connect to the database
  96. connection = pymysql.connect(host=db_host,user=db_user,password=db_pw,db=db_name,charset='utf8mb4',cursorclass=pymysql.cursors.DictCursor)
  97. # Loop over the publications and read all citations from scopus
  98. # Todo: Shift to a separate script !?
  99. try:
  100. with connection.cursor() as cursor:
  101. # Read a single record
  102. sql = "SELECT wpid,eid,citedbycount,citesloaded FROM publications WHERE wpid > 0"
  103. cursor.execute(sql)
  104. result = cursor.fetchall()
  105. for pub in result:
  106. wpid = int(pub['wpid'])
  107. if pub['citedbycount'] is None:
  108. citedbycount = 0
  109. else:
  110. citedbycount = int(pub['citedbycount'])
  111. if pub['citesloaded'] is None:
  112. citesloaded = 0
  113. else:
  114. citesloaded = int(pub['citesloaded'])
  115. # read list of citations
  116. if pub['eid'] and (citedbycount > citesloaded):
  117. print "Processing %d = %s previously cited by %d" % (wpid, pub['eid'], citesloaded)
  118. data = get_scopus_refs(pub['eid'])
  119. #print json.dumps(data,sort_keys=True,indent=4, separators=(',', ': '))
  120. n = len(data)
  121. #print "Number of citations loaded for processing %d" % n
  122. #print data
  123. if n > 0:
  124. for pub in data:
  125. #print pub['eid'] + ' ' + pub['dc:title']
  126. try:
  127. pubstr = json.dumps(pub)
  128. except TypeError:
  129. print("Error serializing pub entry")
  130. # save all comments to the database
  131. # wirte complete scopus data of the article !?
  132. sql = "INSERT IGNORE INTO `citations` (`wpid`,`scopusid`,`eid`,`scopusdata`) VALUES (%s,%s,%s,%s)"
  133. cursor.execute(sql, (wpid,pub['dc:identifier'],pub['eid'],pubstr))
  134. connection.commit()
  135. # Update the number of cites for this article
  136. if n > citesloaded:
  137. print "New citations found %d -> %d" %(citesloaded,n)
  138. sql = "UPDATE `publications` SET `citesloaded`=" + str(n) + " WHERE wpid=" + str(wpid)
  139. #print sql
  140. cursor.execute(sql)
  141. connection.commit()
  142. finally:
  143. connection.close()
  144. def update_wp_posts():
  145. """ Create wordpress posts for all entries that have none """
  146. global npubs
  147. global nnewpubs
  148. print ""
  149. print "=== Create posts for newly registered publication in scopus"
  150. # Connect to the database
  151. connection = pymysql.connect(host=db_host,user=db_user,password=db_pw,db=db_name,charset='utf8mb4',cursorclass=pymysql.cursors.DictCursor)
  152. # Todo: Shift to a separate script !?
  153. try:
  154. with connection.cursor() as cursor:
  155. # Read a single record
  156. sql = "SELECT wpid,eid,citedbycount,citesloaded FROM publications WHERE wpid > 0"
  157. cursor.execute(sql)
  158. result = cursor.fetchall()
  159. print "Total number of publications is %d" % len(result)
  160. npubs = len(result)
  161. #print "Npubs = %d" % npubs
  162. # Count all publications
  163. #sql = "SELECT COUNT(id) FROM publications"
  164. #cursor.execute(sql)
  165. #result = cursor.fetchall()
  166. #if len(result) > 0:
  167. #print result[0]['COUNT(id)']
  168. # Read a single record
  169. sql = "SELECT scopusid,categories FROM publications WHERE wpid IS NULL"
  170. cursor.execute(sql)
  171. result = cursor.fetchall()
  172. if len(result) > 0:
  173. print "Number of new publications is %d" % len(result)
  174. nnewpubs = len(result)
  175. else:
  176. print "Nothing new found"
  177. # Retrieve all information required for the wordpress page
  178. for pub in result:
  179. print "Processing " + pub['scopusid'] + " categories " + pub['categories']
  180. data = get_scopus_data(pub['scopusid'])
  181. #print json.dumps(data, sort_keys=True, indent=4, separators=(',', ': '))
  182. # Parse categories
  183. catlist = []
  184. try:
  185. catlist = json.loads(pub['categories'])
  186. except TypeError:
  187. print("No categories specified")
  188. wpid = wordpress_post_by_scopus(data, catlist)
  189. #print wpid
  190. #print pub['scopusid']
  191. # Warning: the resulting string uses double quotes (") so use
  192. # single quotes (') for the sql command
  193. datastr = json.dumps(data)
  194. #print datastr
  195. # Update publication database !!!
  196. with connection.cursor() as cursor:
  197. # Read a single record
  198. #sql = "UPDATE publications SET wpid=" + str(wpid) + ",scopusdata='" + datastr + "' WHERE scopusid = '" + pub['scopusid'] + "'"
  199. sql = "UPDATE publications SET wpid=" + str(wpid) + " WHERE scopusid = '" + pub['scopusid'] + "'"
  200. cursor.execute(sql)
  201. connection.commit()
  202. finally:
  203. connection.close()
  204. def update_wp_comments():
  205. """ Create a new comment for newly found citations """
  206. global ncites
  207. global nnewcites
  208. print ""
  209. print "=== Create comments for newly registered citations in scopus"
  210. # Connect to the database
  211. connection = pymysql.connect(host=db_host,user=db_user,password=db_pw,db=db_name,charset='utf8mb4',cursorclass=pymysql.cursors.DictCursor)
  212. # Todo: Shift to a separate script !?
  213. try:
  214. with connection.cursor() as cursor:
  215. # Count all citations
  216. sql = "SELECT COUNT(id) FROM citations"
  217. cursor.execute(sql)
  218. result = cursor.fetchall()
  219. if len(result) > 0:
  220. ncites = result[0]['COUNT(id)']
  221. #print result[0]['COUNT(id)']
  222. # Read a single record
  223. sql = "SELECT id,wpid,scopusdata FROM citations WHERE wpcommentid IS NULL"
  224. cursor.execute(sql)
  225. result = cursor.fetchall()
  226. print "Number of new citations is %d" % len(result)
  227. nnewcites = len(result)
  228. for pub in result:
  229. wpid = int(pub['wpid'])
  230. print "Processing post " + str(wpid)
  231. data = []
  232. try:
  233. data = json.loads(pub['scopusdata'])
  234. except TypeError:
  235. print("Scopus data missing?!")
  236. # If the creation of the comment fails, the wpcommentid 0 is
  237. # written to the database. This means, there is no second try
  238. # to get this citations added.
  239. # All failed comments can be found by searching for wpcommentid = 0
  240. #
  241. wpcommentid = 0
  242. try:
  243. wpcommentid = wordpress_comment_by_scopus(wpid, data)
  244. except:
  245. print "Error: Submission of comment failed"
  246. sql = "UPDATE citations SET wpcommentid=" + str(wpcommentid) + " WHERE id = '" + str(pub['id']) + "'"
  247. cursor.execute(sql)
  248. connection.commit()
  249. finally:
  250. connection.close()
  251. # Main
  252. # Prevent sphinx from execution
  253. if __name__ == "__main__":
  254. start = datetime.datetime.now()
  255. print ""
  256. print "***********************************************"
  257. print "**** scopus-get-publications / " + start.strftime("%Y-%m-%d") + " *****"
  258. print "***********************************************"
  259. print ""
  260. # Update publaction database; search for new publications
  261. # Loop over all user groups defined in ak_scopus.py
  262. # Todo: Detect, if there is no access to scopus !!!
  263. #
  264. search_param = '(PUBYEAR AFT %d)' % (sc_start)
  265. for wp in sc_workgroups:
  266. update_publications(wp['authors'],wp['name'],search_param)
  267. update_wp_posts()
  268. # read all citations
  269. # Todo: read only new citations?!
  270. if sc_citations:
  271. update_citations()
  272. # loop over all cites and post comments to wordpress, when necessary
  273. # update database
  274. update_wp_comments()
  275. # Display summary
  276. end = datetime.datetime.now()
  277. print ""
  278. print "Summary: (see also logfile %s) " % log_file
  279. print "Date = " + str(start)
  280. print "NPubs = " + str(npubs)
  281. print "NNewPubs = " + str(nnewpubs)
  282. if sc_citations:
  283. print "NCites = " + str(ncites)
  284. print "NNewCites = " + str(nnewcites)
  285. print "Runtime = " + str(end - start)
  286. # Write summary to log file
  287. if not os.path.isfile(log_file):
  288. print "Create logfile " + log_file
  289. # Open file and write header
  290. log = open(log_file,"w")
  291. log.write(__file__ + "\n")
  292. log.write("\n")
  293. log.write(" Date\t Time\tNPubs\tNNewP\tNCite\tNNewC\t TRun\n")
  294. log.write("------------------------------------------------------------------------------\n")
  295. log.close()
  296. log = open(log_file,"a")
  297. if sc_citations:
  298. log.write("%s\t%s\t%5d\t%5d\t%5d\t%5d\t%s\n" % (start.strftime("%Y-%m-%d"),
  299. start.strftime("%H:%M:%S"),
  300. npubs,nnewpubs,ncites,nnewcites,str(end-start)))
  301. else:
  302. log.write("%s\t%s\t%5d\t%5d\t%5s\t%5s\t%s\n" % (start.strftime("%Y-%m-%d"),
  303. start.strftime("%H:%M:%S"),
  304. npubs,nnewpubs,"","",str(end-start)))
  305. log.close()
  306. # done