scopus-get-publications.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413
# Get new publications
# Publication and citations retrieval
# A. Kopmann, 6.2.17 (ak)
#
# Scope:
# Publications are added once to wordpress as a post or comment.
# Afterwards scopus will not change or modify anything any more !!!
# Update is completely in the responsibility of the ufo users
#
# Todo:
# - add mail to author button
# - save full scopus data in the database
#
# Configuration - Scopus
  15. import datetime
  16. import requests
  17. import json
  18. import os.path
  19. from my_scopus import MY_API_KEY
  20. from my_scopus import ak, csa, pdv, ufo, ufo_pdv, ufo_ips, ufo_eps, ufo_apps
  21. from ak_scopus import get_scopus_list, get_scopus_data, get_scopus_refs
  22. from ak_wordpress import wordpress_post_by_scopus, wordpress_comment_by_scopus
# MySQL persistent data (Account: scopus, $scopus$)
  24. import pymysql.cursors
  25. import pymysql
  26. db_host = 'localhost'
  27. db_user = 'scopus'
  28. db_pw = '$scopus$'
  29. db_name = 'scopus'
  30. log_file = '/Users/kopmann/scopus-publications.log'
  31. # Summary
  32. npubs = 0
  33. nnewpubs= 0
  34. ncites = 0
  35. nnewcites = 0
  36. def update_publications(authids,authname='',scopus_opts = '',max=0):
  37. print "=== Update of publications for the author group: " + authname
  38. #print str(authids)
  39. # Connect to the database
  40. connection = pymysql.connect(host=db_host,user=db_user,password=db_pw,db=db_name,charset='utf8mb4',cursorclass=pymysql.cursors.DictCursor)
  41. # Request all publications of a list of authors (in one query)
  42. # Result: list of records with (scopus ids, eid, citedbycount)
  43. # The citation could be used later also by wordpress (may be via a plugin)
  44. publist = get_scopus_list(authids,scopus_opts,max)
  45. #publist = get_scopus_list(authids, scopus_opts, 3)
  46. #publist = get_scopus_list(authids, '(PUBYEAR AFT 2014)')
  47. print "Total number of publications: %d" % len(publist)
  48. #print publist
  49. # Save all publication to the publication database
  50. try:
  51. with connection.cursor() as cursor:
  52. for pub in publist:
  53. # 1 / Create a new records
  54. #print pub # Todo: strip the prefix SCOPUS_ID?!
  55. sql = "INSERT IGNORE INTO `publications` (`scopusid`,`eid`) VALUES (%s,%s)"
  56. cursor.execute(sql, (pub[0],pub[1]))
  57. sql = "UPDATE `publications` SET `citedbycount` = %s WHERE `scopusid` = %s"
  58. cursor.execute(sql, (pub[2],pub[0]))
  59. # 2 / Add categories
  60. if len(authname) > 0:
  61. catlist = []
  62. sql = "SELECT categories FROM publications WHERE scopusid = %s"
  63. cursor.execute(sql, (pub[0]))
  64. result = cursor.fetchall()
  65. if len(result) > 0:
  66. #print "Categories %s" % result[0]['categories']
  67. cat = result[0]['categories']
  68. try:
  69. catlist = json.loads(cat)
  70. except TypeError:
  71. print("No categories upto now")
  72. if authname not in catlist:
  73. catlist += [authname]
  74. sql = "UPDATE `publications` SET `categories` = %s WHERE `scopusid` = %s"
  75. cursor.execute(sql, (json.dumps(catlist),pub[0]))
  76. # connection is not autocommit by default. So you must commit to save
  77. # your changes.
  78. connection.commit()
  79. finally:
  80. connection.close()
  81. # Read all citations and store in the citation table
  82. def update_citations():
  83. global npubs
  84. print ""
  85. print "=== Update citatation of all publication in the database"
  86. # Connect to the database
  87. connection = pymysql.connect(host=db_host,user=db_user,password=db_pw,db=db_name,charset='utf8mb4',cursorclass=pymysql.cursors.DictCursor)
  88. # Loop over the publications and read all citations from scopus
  89. # Todo: Shift to a separate script !?
  90. try:
  91. with connection.cursor() as cursor:
  92. # Read a single record
  93. sql = "SELECT wpid,eid,citedbycount,citesloaded FROM publications"
  94. cursor.execute(sql)
  95. result = cursor.fetchall()
  96. print "Total number of publications is %d" % len(result)
  97. npubs = len(result)
  98. #print "Npubs = %d" % npubs
  99. for pub in result:
  100. wpid = int(pub['wpid'])
  101. if pub['citedbycount'] is None:
  102. citedbycount = 0
  103. else:
  104. citedbycount = int(pub['citedbycount'])
  105. if pub['citesloaded'] is None:
  106. citesloaded = 0
  107. else:
  108. citesloaded = int(pub['citesloaded'])
  109. # read list of citations
  110. if pub['eid'] and (citedbycount > citesloaded):
  111. print "Processing %d = %s previously cited by %d" % (wpid, pub['eid'], citesloaded)
  112. data = get_scopus_refs(pub['eid'])
  113. #print json.dumps(data,sort_keys=True,indent=4, separators=(',', ': '))
  114. n = len(data)
  115. #print "Number of citations loaded for processing %d" % n
  116. #print data
  117. if n > 0:
  118. for pub in data:
  119. #print pub['eid'] + ' ' + pub['dc:title']
  120. try:
  121. pubstr = json.dumps(pub)
  122. except TypeError:
  123. print("Error serializing pub entry")
  124. # save all comments to the database
  125. # wirte complete scopus data of the article !?
  126. sql = "INSERT IGNORE INTO `citations` (`wpid`,`scopusid`,`eid`,`scopusdata`) VALUES (%s,%s,%s,%s)"
  127. cursor.execute(sql, (wpid,pub['dc:identifier'],pub['eid'],pubstr))
  128. connection.commit()
  129. # Update the number of cites for this article
  130. if n > citesloaded:
  131. print "New citations found %d -> %d" %(citesloaded,n)
  132. sql = "UPDATE `publications` SET `citesloaded`=" + str(n) + " WHERE wpid=" + str(wpid)
  133. #print sql
  134. cursor.execute(sql)
  135. connection.commit()
  136. finally:
  137. connection.close()
  138. # Create wordpress posts for all entries that have none
  139. def update_wp_posts():
  140. global nnewpubs
  141. print ""
  142. print "=== Create posts for newly registered publication in scopus"
  143. # Connect to the database
  144. connection = pymysql.connect(host=db_host,user=db_user,password=db_pw,db=db_name,charset='utf8mb4',cursorclass=pymysql.cursors.DictCursor)
  145. # Todo: Shift to a separate script !?
  146. try:
  147. with connection.cursor() as cursor:
  148. # Count all publications
  149. #sql = "SELECT COUNT(id) FROM publications"
  150. #cursor.execute(sql)
  151. #result = cursor.fetchall()
  152. #if len(result) > 0:
  153. #print result[0]['COUNT(id)']
  154. # Read a single record
  155. sql = "SELECT scopusid,categories FROM publications WHERE wpid IS NULL"
  156. cursor.execute(sql)
  157. result = cursor.fetchall()
  158. if len(result) > 0:
  159. print "Number of new publications is %d" % len(result)
  160. nnewpubs = len(result)
  161. else:
  162. print "Nothing new found"
  163. # Retrieve all information required for the wordpress page
  164. for pub in result:
  165. print "Processing " + pub['scopusid'] + " categories " + pub['categories']
  166. data = get_scopus_data(pub['scopusid'])
  167. #print json.dumps(data, sort_keys=True, indent=4, separators=(',', ': '))
  168. # Parse categories
  169. catlist = []
  170. try:
  171. catlist = json.loads(pub['categories'])
  172. except TypeError:
  173. print("No categories specified")
  174. wpid = wordpress_post_by_scopus(data, catlist)
  175. #print wpid
  176. #print pub['scopusid']
  177. # Warning: the resulting string uses double quotes (") so use
  178. # single quotes (') for the sql command
  179. datastr = json.dumps(data)
  180. #print datastr
  181. # Update publication database !!!
  182. with connection.cursor() as cursor:
  183. # Read a single record
  184. #sql = "UPDATE publications SET wpid=" + str(wpid) + ",scopusdata='" + datastr + "' WHERE scopusid = '" + pub['scopusid'] + "'"
  185. sql = "UPDATE publications SET wpid=" + str(wpid) + " WHERE scopusid = '" + pub['scopusid'] + "'"
  186. cursor.execute(sql)
  187. connection.commit()
  188. finally:
  189. connection.close()
  190. def update_wp_comments():
  191. global ncites
  192. global nnewcites
  193. print ""
  194. print "=== Create comments for newly registered citations in scopus"
  195. # Connect to the database
  196. connection = pymysql.connect(host=db_host,user=db_user,password=db_pw,db=db_name,charset='utf8mb4',cursorclass=pymysql.cursors.DictCursor)
  197. # Todo: Shift to a separate script !?
  198. try:
  199. with connection.cursor() as cursor:
  200. # Count all citations
  201. sql = "SELECT COUNT(id) FROM citations"
  202. cursor.execute(sql)
  203. result = cursor.fetchall()
  204. if len(result) > 0:
  205. ncites = result[0]['COUNT(id)']
  206. #print result[0]['COUNT(id)']
  207. # Read a single record
  208. sql = "SELECT id,wpid,scopusdata FROM citations WHERE wpcommentid IS NULL"
  209. cursor.execute(sql)
  210. result = cursor.fetchall()
  211. print "Number of new citations is %d" % len(result)
  212. nnewcites = len(result)
  213. for pub in result:
  214. wpid = int(pub['wpid'])
  215. print "Processing post " + str(wpid)
  216. data = []
  217. try:
  218. data = json.loads(pub['scopusdata'])
  219. except TypeError:
  220. print("Scopus data missing?!")
  221. # If the creation of the comment fails, the wpcommentid 0 is
  222. # written to the database. This means, there is no second try
  223. # to get this citations added.
  224. # All failed comments can be found by searching for wpcommentid = 0
  225. #
  226. wpcommentid = 0
  227. try:
  228. wpcommentid = wordpress_comment_by_scopus(wpid, data)
  229. except:
  230. print "Error: Submission of comment failed"
  231. sql = "UPDATE citations SET wpcommentid=" + str(wpcommentid) + " WHERE id = '" + str(pub['id']) + "'"
  232. cursor.execute(sql)
  233. connection.commit()
  234. finally:
  235. connection.close()
# Todo: Add a script to save the data for all publications in the database!!!
# There was some problem before?!
#
# Todo: Add scripts to check consistency in the database
# and fix problems if detected
# E.g. search for wpcommentid == 0
# Check if wp posts + comments are still available, display
# deleted entries
#
  245. # Main
  246. start = datetime.datetime.now()
  247. print ""
  248. print "***********************************************"
  249. print "**** scopus-get-publications / " + start.strftime("%Y-%m-%d") + " *****"
  250. print "***********************************************"
  251. print ""
  252. # Update publaction database; search for new publications
  253. # Loop over all user groups defined in ak_scopus.py
  254. # Todo: Detect, if there is no access to scopus !!!
  255. #
  256. # Define the author, that should be considered
  257. #authors = ["Computing", ufo_pdv]
  258. #print authors
  259. #update_publications(ufo_pdv, "Computing", '(PUBYEAR AFT 2006)',10)
  260. update_publications(ufo_pdv, "Computing", '(PUBYEAR AFT 2006)')
  261. update_publications(ufo_ips, "X-ray Imaging", '(PUBYEAR AFT 2010)')
  262. update_publications(ufo_eps, "Electronics", '(PUBYEAR AFT 2010)')
  263. update_publications(ufo_apps, "Morphology", '(PUBYEAR AFT 2010)')
  264. update_wp_posts()
  265. # read all citations
  266. # Todo: read only new citations?!
  267. update_citations()
  268. # loop over all cites and post comments to wordpress, when necessary
  269. # update database
  270. update_wp_comments()
  271. # Todo: deactivate comments for scopus posts!!!
  272. # Display summary
  273. end = datetime.datetime.now()
  274. print ""
  275. print "Summary: (see also logfile %s) " % log_file
  276. print "Date = " + str(start)
  277. print "NPubs = " + str(npubs)
  278. print "NNewPubs = " + str(nnewpubs)
  279. print "NCites = " + str(ncites)
  280. print "NNewCites = " + str(nnewcites)
  281. print "Runtime = " + str(end - start)
  282. # Write summary to log file
  283. if not os.path.isfile(log_file):
  284. print "Create logfile " + log_file
  285. # Open file and write header
  286. log = open(log_file,"w")
  287. log.write(__file__ + "\n")
  288. log.write("\n")
  289. log.write(" Date\t Time\tNPubs\tNNewP\tNCite\tNNewC\t TRun\n")
  290. log.write("------------------------------------------------------------------------------\n")
  291. log.close()
  292. log = open(log_file,"a")
  293. log.write("%s\t%s\t%5d\t%5d\t%5d\t%5d\t%s\n" % (start.strftime("%Y-%m-%d"),
  294. start.strftime("%H:%M:%S"),
  295. npubs,nnewpubs,ncites,nnewcites,str(end-start)))
  296. log.close()
  297. # done