# scopus-get-publications.py
  1. # Get new publications
  2. # Publication and citations retrieval
  3. # A. Kopmann, 6.2.17 (ak)
  4. #
  5. # Scope:
# Publications are added to WordPress once, as a post or comment.
# Afterwards scopus will not change or modify anything any more!
# Updates are entirely the responsibility of the ufo users.
  9. #
  10. # Todo:
  11. # - add mail to author button
  12. # - save full scopus data in the database
  13. #
  14. # Configuration - Scopus
  15. import datetime
  16. import requests
  17. import json
  18. import os.path
  19. #from my_scopus import MY_API_KEY
  20. #from my_scopus import ak, csa, pdv, ufo, ufo_pdv, ufo_ips, ufo_eps, ufo_apps
  21. from ak_scopus import get_scopus_list, get_scopus_data, get_scopus_refs
  22. from ak_wordpress import wordpress_post_by_scopus, wordpress_comment_by_scopus, wordpress_get_post
  23. # Mysql persistent data (Accout: scopus, $scopus$)
  24. import pymysql.cursors
  25. import pymysql
  26. from config import *
  27. # Summary
  28. npubs = 0
  29. nnewpubs= 0
  30. ncites = 0
  31. nnewcites = 0
  32. # Read publications of a list of authors and store in the database
  33. def update_publications(authids,authname='',scopus_opts = '',max=0):
  34. print "=== Update of publications for the author group: " + authname
  35. #print str(authids)
  36. # Connect to the database
  37. connection = pymysql.connect(host=db_host,user=db_user,password=db_pw,db=db_name,charset='utf8mb4',cursorclass=pymysql.cursors.DictCursor)
  38. # Request all publications of a list of authors (in one query)
  39. # Result: list of records with (scopus ids, eid, citedbycount)
  40. # The citation could be used later also by wordpress (may be via a plugin)
  41. publist = get_scopus_list(authids,scopus_opts,max)
  42. #publist = get_scopus_list(authids, scopus_opts, 3)
  43. #publist = get_scopus_list(authids, '(PUBYEAR AFT 2014)')
  44. print "Total number of publications: %d" % len(publist)
  45. #print publist
  46. # Save all publication to the publication database
  47. try:
  48. with connection.cursor() as cursor:
  49. for pub in publist:
  50. # 1 / Create a new records
  51. #print pub # Todo: strip the prefix SCOPUS_ID?!
  52. sql = "INSERT IGNORE INTO `publications` (`scopusid`,`eid`) VALUES (%s,%s)"
  53. cursor.execute(sql, (pub[0],pub[1]))
  54. sql = "UPDATE `publications` SET `citedbycount` = %s WHERE `scopusid` = %s"
  55. cursor.execute(sql, (pub[2],pub[0]))
  56. # 2 / Add categories
  57. if len(authname) > 0:
  58. catlist = []
  59. sql = "SELECT categories FROM publications WHERE scopusid = %s"
  60. cursor.execute(sql, (pub[0]))
  61. result = cursor.fetchall()
  62. if len(result) > 0:
  63. #print "Categories %s" % result[0]['categories']
  64. cat = result[0]['categories']
  65. try:
  66. catlist = json.loads(cat)
  67. except TypeError:
  68. print("No categories upto now")
  69. if authname not in catlist:
  70. catlist += [authname]
  71. sql = "UPDATE `publications` SET `categories` = %s WHERE `scopusid` = %s"
  72. cursor.execute(sql, (json.dumps(catlist),pub[0]))
  73. # connection is not autocommit by default. So you must commit to save
  74. # your changes.
  75. connection.commit()
  76. finally:
  77. connection.close()
  78. # Read all citations and store in the citation table
  79. def update_citations():
  80. global npubs
  81. print ""
  82. print "=== Update citatation of all publication in the database"
  83. # Connect to the database
  84. connection = pymysql.connect(host=db_host,user=db_user,password=db_pw,db=db_name,charset='utf8mb4',cursorclass=pymysql.cursors.DictCursor)
  85. # Loop over the publications and read all citations from scopus
  86. # Todo: Shift to a separate script !?
  87. try:
  88. with connection.cursor() as cursor:
  89. # Read a single record
  90. sql = "SELECT wpid,eid,citedbycount,citesloaded FROM publications WHERE wpid > 0"
  91. cursor.execute(sql)
  92. result = cursor.fetchall()
  93. print "Total number of publications is %d" % len(result)
  94. npubs = len(result)
  95. #print "Npubs = %d" % npubs
  96. for pub in result:
  97. wpid = int(pub['wpid'])
  98. if pub['citedbycount'] is None:
  99. citedbycount = 0
  100. else:
  101. citedbycount = int(pub['citedbycount'])
  102. if pub['citesloaded'] is None:
  103. citesloaded = 0
  104. else:
  105. citesloaded = int(pub['citesloaded'])
  106. # read list of citations
  107. if pub['eid'] and (citedbycount > citesloaded):
  108. print "Processing %d = %s previously cited by %d" % (wpid, pub['eid'], citesloaded)
  109. data = get_scopus_refs(pub['eid'])
  110. #print json.dumps(data,sort_keys=True,indent=4, separators=(',', ': '))
  111. n = len(data)
  112. #print "Number of citations loaded for processing %d" % n
  113. #print data
  114. if n > 0:
  115. for pub in data:
  116. #print pub['eid'] + ' ' + pub['dc:title']
  117. try:
  118. pubstr = json.dumps(pub)
  119. except TypeError:
  120. print("Error serializing pub entry")
  121. # save all comments to the database
  122. # wirte complete scopus data of the article !?
  123. sql = "INSERT IGNORE INTO `citations` (`wpid`,`scopusid`,`eid`,`scopusdata`) VALUES (%s,%s,%s,%s)"
  124. cursor.execute(sql, (wpid,pub['dc:identifier'],pub['eid'],pubstr))
  125. connection.commit()
  126. # Update the number of cites for this article
  127. if n > citesloaded:
  128. print "New citations found %d -> %d" %(citesloaded,n)
  129. sql = "UPDATE `publications` SET `citesloaded`=" + str(n) + " WHERE wpid=" + str(wpid)
  130. #print sql
  131. cursor.execute(sql)
  132. connection.commit()
  133. finally:
  134. connection.close()
  135. # Create wordpress posts for all entries that have none
  136. def update_wp_posts():
  137. global nnewpubs
  138. print ""
  139. print "=== Create posts for newly registered publication in scopus"
  140. # Connect to the database
  141. connection = pymysql.connect(host=db_host,user=db_user,password=db_pw,db=db_name,charset='utf8mb4',cursorclass=pymysql.cursors.DictCursor)
  142. # Todo: Shift to a separate script !?
  143. try:
  144. with connection.cursor() as cursor:
  145. # Count all publications
  146. #sql = "SELECT COUNT(id) FROM publications"
  147. #cursor.execute(sql)
  148. #result = cursor.fetchall()
  149. #if len(result) > 0:
  150. #print result[0]['COUNT(id)']
  151. # Read a single record
  152. sql = "SELECT scopusid,categories FROM publications WHERE wpid IS NULL"
  153. cursor.execute(sql)
  154. result = cursor.fetchall()
  155. if len(result) > 0:
  156. print "Number of new publications is %d" % len(result)
  157. nnewpubs = len(result)
  158. else:
  159. print "Nothing new found"
  160. # Retrieve all information required for the wordpress page
  161. for pub in result:
  162. print "Processing " + pub['scopusid'] + " categories " + pub['categories']
  163. data = get_scopus_data(pub['scopusid'])
  164. #print json.dumps(data, sort_keys=True, indent=4, separators=(',', ': '))
  165. # Parse categories
  166. catlist = []
  167. try:
  168. catlist = json.loads(pub['categories'])
  169. except TypeError:
  170. print("No categories specified")
  171. wpid = wordpress_post_by_scopus(data, catlist)
  172. #print wpid
  173. #print pub['scopusid']
  174. # Warning: the resulting string uses double quotes (") so use
  175. # single quotes (') for the sql command
  176. datastr = json.dumps(data)
  177. #print datastr
  178. # Update publication database !!!
  179. with connection.cursor() as cursor:
  180. # Read a single record
  181. #sql = "UPDATE publications SET wpid=" + str(wpid) + ",scopusdata='" + datastr + "' WHERE scopusid = '" + pub['scopusid'] + "'"
  182. sql = "UPDATE publications SET wpid=" + str(wpid) + " WHERE scopusid = '" + pub['scopusid'] + "'"
  183. cursor.execute(sql)
  184. connection.commit()
  185. finally:
  186. connection.close()
  187. def update_wp_comments():
  188. global ncites
  189. global nnewcites
  190. print ""
  191. print "=== Create comments for newly registered citations in scopus"
  192. # Connect to the database
  193. connection = pymysql.connect(host=db_host,user=db_user,password=db_pw,db=db_name,charset='utf8mb4',cursorclass=pymysql.cursors.DictCursor)
  194. # Todo: Shift to a separate script !?
  195. try:
  196. with connection.cursor() as cursor:
  197. # Count all citations
  198. sql = "SELECT COUNT(id) FROM citations"
  199. cursor.execute(sql)
  200. result = cursor.fetchall()
  201. if len(result) > 0:
  202. ncites = result[0]['COUNT(id)']
  203. #print result[0]['COUNT(id)']
  204. # Read a single record
  205. sql = "SELECT id,wpid,scopusdata FROM citations WHERE wpcommentid IS NULL"
  206. cursor.execute(sql)
  207. result = cursor.fetchall()
  208. print "Number of new citations is %d" % len(result)
  209. nnewcites = len(result)
  210. for pub in result:
  211. wpid = int(pub['wpid'])
  212. print "Processing post " + str(wpid)
  213. data = []
  214. try:
  215. data = json.loads(pub['scopusdata'])
  216. except TypeError:
  217. print("Scopus data missing?!")
  218. # If the creation of the comment fails, the wpcommentid 0 is
  219. # written to the database. This means, there is no second try
  220. # to get this citations added.
  221. # All failed comments can be found by searching for wpcommentid = 0
  222. #
  223. wpcommentid = 0
  224. try:
  225. wpcommentid = wordpress_comment_by_scopus(wpid, data)
  226. except:
  227. print "Error: Submission of comment failed"
  228. sql = "UPDATE citations SET wpcommentid=" + str(wpcommentid) + " WHERE id = '" + str(pub['id']) + "'"
  229. cursor.execute(sql)
  230. connection.commit()
  231. finally:
  232. connection.close()
# Todo: Add a script to save the data for all publications in the database!!!
#       There was some problem before?!
#
# Todo: Add scripts to check consistency in the database
#       and fix problems if detected
#       E.g. search for wpcommentid == 0
#       Check whether wp posts + comments are still available, display
#       deleted entries
#
  242. # Main
  243. start = datetime.datetime.now()
  244. print ""
  245. print "***********************************************"
  246. print "**** scopus-get-publications / " + start.strftime("%Y-%m-%d") + " *****"
  247. print "***********************************************"
  248. print ""
  249. # Update publaction database; search for new publications
  250. # Loop over all user groups defined in ak_scopus.py
  251. # Todo: Detect, if there is no access to scopus !!!
  252. #
  253. search_param = '(PUBYEAR AFT %d)' % (sc_start)
  254. for wp in sc_workgroups:
  255. update_publications(wp['authors'],wp['name'],search_param)
  256. update_wp_posts()
  257. # read all citations
  258. # Todo: read only new citations?!
  259. update_citations()
  260. # loop over all cites and post comments to wordpress, when necessary
  261. # update database
  262. update_wp_comments()
  263. # Todo: deactivate comments for scopus posts!!!
  264. # Display summary
  265. end = datetime.datetime.now()
  266. print ""
  267. print "Summary: (see also logfile %s) " % log_file
  268. print "Date = " + str(start)
  269. print "NPubs = " + str(npubs)
  270. print "NNewPubs = " + str(nnewpubs)
  271. print "NCites = " + str(ncites)
  272. print "NNewCites = " + str(nnewcites)
  273. print "Runtime = " + str(end - start)
  274. # Write summary to log file
  275. if not os.path.isfile(log_file):
  276. print "Create logfile " + log_file
  277. # Open file and write header
  278. log = open(log_file,"w")
  279. log.write(__file__ + "\n")
  280. log.write("\n")
  281. log.write(" Date\t Time\tNPubs\tNNewP\tNCite\tNNewC\t TRun\n")
  282. log.write("------------------------------------------------------------------------------\n")
  283. log.close()
  284. log = open(log_file,"a")
  285. log.write("%s\t%s\t%5d\t%5d\t%5d\t%5d\t%s\n" % (start.strftime("%Y-%m-%d"),
  286. start.strftime("%H:%M:%S"),
  287. npubs,nnewpubs,ncites,nnewcites,str(end-start)))
  288. log.close()
  289. # done