scopus-get-publications.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415
  1. # Get new publications
  2. # Publication and citations retrieval
  3. # A. Kopmann, 6.2.17 (ak)
  4. #
  5. # Scope:
  6. # Publications are added to wordpress once, as a post or a comment.
  7. # Afterwards this script will not change or modify them any more !!!
  8. # Updates are completely the responsibility of the ufo users
  9. #
  10. # Todo:
  11. # - add mail to author button
  12. # - save full scopus data in the database
  13. #
  14. # Configuration - Scopus
  15. import datetime
  16. import requests
  17. import json
  18. import os.path
  19. from my_scopus import MY_API_KEY
  20. from my_scopus import ak, csa, pdv, ufo, ufo_pdv, ufo_ips, ufo_eps, ufo_apps
  21. from ak_scopus import get_scopus_list, get_scopus_data, get_scopus_refs
  22. from ak_wordpress import wordpress_post_by_scopus, wordpress_comment_by_scopus, wordpress_get_post
  23. # MySQL persistent data (Account: scopus, $scopus$)
  24. import pymysql.cursors
  25. import pymysql
  26. db_host = 'localhost'
  27. db_user = 'scopus'
  28. db_pw = '$scopus$'
  29. db_name = 'scopus'
  30. log_file = '/Users/kopmann/scopus-publications.log'
  31. # Summary
  32. npubs = 0
  33. nnewpubs= 0
  34. ncites = 0
  35. nnewcites = 0
  36. # Read publications of a list of authors and store in the database
  37. def update_publications(authids,authname='',scopus_opts = '',max=0):
  38. print "=== Update of publications for the author group: " + authname
  39. #print str(authids)
  40. # Connect to the database
  41. connection = pymysql.connect(host=db_host,user=db_user,password=db_pw,db=db_name,charset='utf8mb4',cursorclass=pymysql.cursors.DictCursor)
  42. # Request all publications of a list of authors (in one query)
  43. # Result: list of records with (scopus ids, eid, citedbycount)
  44. # The citation could be used later also by wordpress (may be via a plugin)
  45. publist = get_scopus_list(authids,scopus_opts,max)
  46. #publist = get_scopus_list(authids, scopus_opts, 3)
  47. #publist = get_scopus_list(authids, '(PUBYEAR AFT 2014)')
  48. print "Total number of publications: %d" % len(publist)
  49. #print publist
  50. # Save all publication to the publication database
  51. try:
  52. with connection.cursor() as cursor:
  53. for pub in publist:
  54. # 1 / Create a new records
  55. #print pub # Todo: strip the prefix SCOPUS_ID?!
  56. sql = "INSERT IGNORE INTO `publications` (`scopusid`,`eid`) VALUES (%s,%s)"
  57. cursor.execute(sql, (pub[0],pub[1]))
  58. sql = "UPDATE `publications` SET `citedbycount` = %s WHERE `scopusid` = %s"
  59. cursor.execute(sql, (pub[2],pub[0]))
  60. # 2 / Add categories
  61. if len(authname) > 0:
  62. catlist = []
  63. sql = "SELECT categories FROM publications WHERE scopusid = %s"
  64. cursor.execute(sql, (pub[0]))
  65. result = cursor.fetchall()
  66. if len(result) > 0:
  67. #print "Categories %s" % result[0]['categories']
  68. cat = result[0]['categories']
  69. try:
  70. catlist = json.loads(cat)
  71. except TypeError:
  72. print("No categories upto now")
  73. if authname not in catlist:
  74. catlist += [authname]
  75. sql = "UPDATE `publications` SET `categories` = %s WHERE `scopusid` = %s"
  76. cursor.execute(sql, (json.dumps(catlist),pub[0]))
  77. # connection is not autocommit by default. So you must commit to save
  78. # your changes.
  79. connection.commit()
  80. finally:
  81. connection.close()
  82. # Read all citations and store in the citation table
  83. def update_citations():
  84. global npubs
  85. print ""
  86. print "=== Update citatation of all publication in the database"
  87. # Connect to the database
  88. connection = pymysql.connect(host=db_host,user=db_user,password=db_pw,db=db_name,charset='utf8mb4',cursorclass=pymysql.cursors.DictCursor)
  89. # Loop over the publications and read all citations from scopus
  90. # Todo: Shift to a separate script !?
  91. try:
  92. with connection.cursor() as cursor:
  93. # Read a single record
  94. sql = "SELECT wpid,eid,citedbycount,citesloaded FROM publications WHERE wpid > 0"
  95. cursor.execute(sql)
  96. result = cursor.fetchall()
  97. print "Total number of publications is %d" % len(result)
  98. npubs = len(result)
  99. #print "Npubs = %d" % npubs
  100. for pub in result:
  101. wpid = int(pub['wpid'])
  102. if pub['citedbycount'] is None:
  103. citedbycount = 0
  104. else:
  105. citedbycount = int(pub['citedbycount'])
  106. if pub['citesloaded'] is None:
  107. citesloaded = 0
  108. else:
  109. citesloaded = int(pub['citesloaded'])
  110. # read list of citations
  111. if pub['eid'] and (citedbycount > citesloaded):
  112. print "Processing %d = %s previously cited by %d" % (wpid, pub['eid'], citesloaded)
  113. data = get_scopus_refs(pub['eid'])
  114. #print json.dumps(data,sort_keys=True,indent=4, separators=(',', ': '))
  115. n = len(data)
  116. #print "Number of citations loaded for processing %d" % n
  117. #print data
  118. if n > 0:
  119. for pub in data:
  120. #print pub['eid'] + ' ' + pub['dc:title']
  121. try:
  122. pubstr = json.dumps(pub)
  123. except TypeError:
  124. print("Error serializing pub entry")
  125. # save all comments to the database
  126. # wirte complete scopus data of the article !?
  127. sql = "INSERT IGNORE INTO `citations` (`wpid`,`scopusid`,`eid`,`scopusdata`) VALUES (%s,%s,%s,%s)"
  128. cursor.execute(sql, (wpid,pub['dc:identifier'],pub['eid'],pubstr))
  129. connection.commit()
  130. # Update the number of cites for this article
  131. if n > citesloaded:
  132. print "New citations found %d -> %d" %(citesloaded,n)
  133. sql = "UPDATE `publications` SET `citesloaded`=" + str(n) + " WHERE wpid=" + str(wpid)
  134. #print sql
  135. cursor.execute(sql)
  136. connection.commit()
  137. finally:
  138. connection.close()
  139. # Create wordpress posts for all entries that have none
  140. def update_wp_posts():
  141. global nnewpubs
  142. print ""
  143. print "=== Create posts for newly registered publication in scopus"
  144. # Connect to the database
  145. connection = pymysql.connect(host=db_host,user=db_user,password=db_pw,db=db_name,charset='utf8mb4',cursorclass=pymysql.cursors.DictCursor)
  146. # Todo: Shift to a separate script !?
  147. try:
  148. with connection.cursor() as cursor:
  149. # Count all publications
  150. #sql = "SELECT COUNT(id) FROM publications"
  151. #cursor.execute(sql)
  152. #result = cursor.fetchall()
  153. #if len(result) > 0:
  154. #print result[0]['COUNT(id)']
  155. # Read a single record
  156. sql = "SELECT scopusid,categories FROM publications WHERE wpid IS NULL"
  157. cursor.execute(sql)
  158. result = cursor.fetchall()
  159. if len(result) > 0:
  160. print "Number of new publications is %d" % len(result)
  161. nnewpubs = len(result)
  162. else:
  163. print "Nothing new found"
  164. # Retrieve all information required for the wordpress page
  165. for pub in result:
  166. print "Processing " + pub['scopusid'] + " categories " + pub['categories']
  167. data = get_scopus_data(pub['scopusid'])
  168. #print json.dumps(data, sort_keys=True, indent=4, separators=(',', ': '))
  169. # Parse categories
  170. catlist = []
  171. try:
  172. catlist = json.loads(pub['categories'])
  173. except TypeError:
  174. print("No categories specified")
  175. wpid = wordpress_post_by_scopus(data, catlist)
  176. #print wpid
  177. #print pub['scopusid']
  178. # Warning: the resulting string uses double quotes (") so use
  179. # single quotes (') for the sql command
  180. datastr = json.dumps(data)
  181. #print datastr
  182. # Update publication database !!!
  183. with connection.cursor() as cursor:
  184. # Read a single record
  185. #sql = "UPDATE publications SET wpid=" + str(wpid) + ",scopusdata='" + datastr + "' WHERE scopusid = '" + pub['scopusid'] + "'"
  186. sql = "UPDATE publications SET wpid=" + str(wpid) + " WHERE scopusid = '" + pub['scopusid'] + "'"
  187. cursor.execute(sql)
  188. connection.commit()
  189. finally:
  190. connection.close()
  191. def update_wp_comments():
  192. global ncites
  193. global nnewcites
  194. print ""
  195. print "=== Create comments for newly registered citations in scopus"
  196. # Connect to the database
  197. connection = pymysql.connect(host=db_host,user=db_user,password=db_pw,db=db_name,charset='utf8mb4',cursorclass=pymysql.cursors.DictCursor)
  198. # Todo: Shift to a separate script !?
  199. try:
  200. with connection.cursor() as cursor:
  201. # Count all citations
  202. sql = "SELECT COUNT(id) FROM citations"
  203. cursor.execute(sql)
  204. result = cursor.fetchall()
  205. if len(result) > 0:
  206. ncites = result[0]['COUNT(id)']
  207. #print result[0]['COUNT(id)']
  208. # Read a single record
  209. sql = "SELECT id,wpid,scopusdata FROM citations WHERE wpcommentid IS NULL"
  210. cursor.execute(sql)
  211. result = cursor.fetchall()
  212. print "Number of new citations is %d" % len(result)
  213. nnewcites = len(result)
  214. for pub in result:
  215. wpid = int(pub['wpid'])
  216. print "Processing post " + str(wpid)
  217. data = []
  218. try:
  219. data = json.loads(pub['scopusdata'])
  220. except TypeError:
  221. print("Scopus data missing?!")
  222. # If the creation of the comment fails, the wpcommentid 0 is
  223. # written to the database. This means, there is no second try
  224. # to get this citations added.
  225. # All failed comments can be found by searching for wpcommentid = 0
  226. #
  227. wpcommentid = 0
  228. try:
  229. wpcommentid = wordpress_comment_by_scopus(wpid, data)
  230. except:
  231. print "Error: Submission of comment failed"
  232. sql = "UPDATE citations SET wpcommentid=" + str(wpcommentid) + " WHERE id = '" + str(pub['id']) + "'"
  233. cursor.execute(sql)
  234. connection.commit()
  235. finally:
  236. connection.close()
  237. # Todo: Add a script to save the data for all publications in the database!!!
  238. # There was some problem before?!
  239. #
  240. # Todo: Add scripts to check consistency in the database
  241. # and fix problems if detected
  242. # E.g. search for wpcommentid == 0
  243. # Check if wp posts + comments are still available, display
  244. # deleted entries
  245. #
  246. # Main
  247. start = datetime.datetime.now()
  248. print ""
  249. print "***********************************************"
  250. print "**** scopus-get-publications / " + start.strftime("%Y-%m-%d") + " *****"
  251. print "***********************************************"
  252. print ""
  253. # Update publaction database; search for new publications
  254. # Loop over all user groups defined in ak_scopus.py
  255. # Todo: Detect, if there is no access to scopus !!!
  256. #
  257. # Define the author, that should be considered
  258. #authors = ["Computing", ufo_pdv]
  259. #print authors
  260. #update_publications(ufo_pdv, "Computing", '(PUBYEAR AFT 2006)',10)
  261. update_publications(ufo_pdv, "Computing", '(PUBYEAR AFT 2006)')
  262. update_publications(ufo_ips, "X-ray Imaging", '(PUBYEAR AFT 2010)')
  263. update_publications(ufo_eps, "Electronics", '(PUBYEAR AFT 2010)')
  264. update_publications(ufo_apps, "Morphology", '(PUBYEAR AFT 2010)')
  265. update_wp_posts()
  266. # read all citations
  267. # Todo: read only new citations?!
  268. update_citations()
  269. # loop over all cites and post comments to wordpress, when necessary
  270. # update database
  271. update_wp_comments()
  272. # Todo: deactivate comments for scopus posts!!!
  273. # Display summary
  274. end = datetime.datetime.now()
  275. print ""
  276. print "Summary: (see also logfile %s) " % log_file
  277. print "Date = " + str(start)
  278. print "NPubs = " + str(npubs)
  279. print "NNewPubs = " + str(nnewpubs)
  280. print "NCites = " + str(ncites)
  281. print "NNewCites = " + str(nnewcites)
  282. print "Runtime = " + str(end - start)
  283. # Write summary to log file
  284. if not os.path.isfile(log_file):
  285. print "Create logfile " + log_file
  286. # Open file and write header
  287. log = open(log_file,"w")
  288. log.write(__file__ + "\n")
  289. log.write("\n")
  290. log.write(" Date\t Time\tNPubs\tNNewP\tNCite\tNNewC\t TRun\n")
  291. log.write("------------------------------------------------------------------------------\n")
  292. log.close()
  293. log = open(log_file,"a")
  294. log.write("%s\t%s\t%5d\t%5d\t%5d\t%5d\t%s\n" % (start.strftime("%Y-%m-%d"),
  295. start.strftime("%H:%M:%S"),
  296. npubs,nnewpubs,ncites,nnewcites,str(end-start)))
  297. log.close()
  298. # done