Browse Source

Update of collab filter for IPE site

Andreas Kopmann 5 years ago
parent
commit
224486371f

+ 1 - 0
.gitignore

@@ -1,5 +1,6 @@
 *config.py
 *.pyc
+*.swp
 info/
 scopus/
 scopus-master/

+ 10 - 17
ak_scopus.py

@@ -3,7 +3,6 @@
 
 import requests
 import json
-from pprint import pprint
 
 from config import *
 
@@ -115,9 +114,7 @@ def get_scopus_data(SCOPUS_ID):
     url = ("http://api.elsevier.com/content/abstract/scopus_id/"
            + SCOPUS_ID
            + "?field=article-number,title,publicationName,volume,issueIdentifier,"
-           + "prism:pageRange,coverDate,article-number,eid,doi,citedby-count,"
-	   + "prism:aggregationType,url,identifier,description,authkeywords,"
-	   + "authors,prism:issn,idxterms")
+           + "prism:pageRange,coverDate,article-number,eid,doi,citedby-count,prism:aggregationType,url,identifier,description,authkeywords,authors,prism:issn,authkeywords")
     #print url
     resp = requests.get(url,
             headers={'Accept':'application/json',
@@ -151,23 +148,19 @@ def get_scopus_refs(EID):
         resp = requests.get(url,headers={'Accept':'application/json','X-ELS-APIKey':MY_API_KEY})
 
         results = resp.json()
-        #pprint (json.dumps(results,sort_keys=True,indent=4, separators=(',', ': ')))
+        #print json.dumps(results,sort_keys=True,indent=4, separators=(',', ': '))
 
-        try:
-           if (n==0):
-               n = int(results['search-results']['opensearch:totalResults'])
-               #print "Current number citations in scopus = %d" % n
-               npubstoget = n
+        if (n==0):
+            n = int(results['search-results']['opensearch:totalResults'])
+            #print "Current number citations in scopus = %d" % n
+            npubstoget = n
 
-           if (n>0):
-               publist += results['search-results']['entry']
+        if (n>0):
+                publist += results['search-results']['entry']
 
-           npubstoget = npubstoget - count
-           start += count
+        npubstoget = npubstoget - count
+        start += count
 
-        except:
-	   print "Error:"
-	   pprint (json.dumps(results,sort_keys=True,indent=4, separators=(',', ': ')))
 
     return publist
 

+ 40 - 20
ak_wordpress.py

@@ -12,6 +12,7 @@ from wordpress_xmlrpc import Client
 from wordpress_xmlrpc import WordPressPost, WordPressComment
 from wordpress_xmlrpc.methods.posts import GetPost, NewPost, EditPost
 from wordpress_xmlrpc.methods.comments import NewComment, EditComment
+from wordpress_xmlrpc.methods.taxonomies import GetTerms
 
 from config import *
 
@@ -19,6 +20,19 @@ from config import *
 wp = Client(wp_api_url, wp_user, wp_password) 
 
 
+#
+# Get category from slug name used in the configuration file
+#
+
+def wordpress_get_category(slug):
+    """Return the WordPress category term whose slug equals `slug`.
+
+    The terms of the 'category' taxonomy are fetched through the
+    module-level client `wp`; the first term whose `slug` attribute
+    matches is returned. Returns None when no term matches.
+    """
+    terms = wp.call(GetTerms('category'))
+    matches = (term for term in terms if term.slug == slug)
+    return next(matches, None)
+
+
 #
 # query post
 #
@@ -43,14 +57,19 @@ def wordpress_get_post(wpid):
 #
 def wordpress_post_by_scopus(data, category = []):
     """ Create a new post based on the Scopus information """
+
+    #print data['abstracts-retrieval-response']
  
     try:
         coredata = data['abstracts-retrieval-response']['coredata']
         authors = data['abstracts-retrieval-response']['authors']['author']
+
     except KeyError:
+        pprint(data)
+        print ""
         print "Have not found authors in dataset"
         print " -> Is the connection to scopus broken???"
-        exit()
+        return(0)
 
     # decode date
     tsstring = coredata['prism:coverDate'].encode('utf-8')
@@ -115,39 +134,40 @@ def wordpress_post_by_scopus(data, category = []):
 
     #print post.content
 
-    post.id = wp.call(NewPost(post)) # Creates a new post and returns the id!
+    #post.id = wp.call(NewPost(post)) # Creates a new post and returns the id!
+
+    catlist = []
+    for slug in category:
+        cat = wordpress_get_category(slug)
+        catlist.append(cat)
+    post.terms = catlist
 
     try:
         taglist = []
-        for tag in data['abstracts-retrieval-response']['idxterms']['mainterm']:
-	    #print tag['$'],type(tag['$'])
+        for tag in data['abstracts-retrieval-response']['authkeywords']['author-keyword']:
+            print "Keyword: ", tag
             taglist.append(tag['$'])
     except:
         pass
 
-    print "Keywords:"
-    print taglist
-
-    if category == '':
-        catlist = ['Publications']
-    else:
-        catlist = ['Publications'] + category
     post.terms_names = {
-	    'post_tag': taglist,
-            'category': catlist # defined in WP + python script
+            'category': ['Publications'],
+            'post_tag': taglist
         }
 
     # whoops, I forgot to publish it!
-    #post.post_status = 'publish' # alternative is draft here !
-    if (len(authors) > 25):
-        post.post_status = 'draft' # alternative is draft here !
-        print "Too many authors %d - might be a collaboration paper!?" % (len(authors))
+    if len(authors) > sc_max_authors:
+        post.post_status = 'draft' # check how to handle publication in wordpress
+        print "Too many authors %d - set to draft" % (len(authors))
     else:
-        post.post_status = 'publish' # alternative is draft here !
-
+        post.post_status = 'publish' # handled as a standard publication
     post.comment_status = 'closed' # allow comments - may be only for scopus
-    wp.call(EditPost(post.id, post))# Update the before created post
 
+    # Todo: this can fail! Add proper error handling 
+    post.id = wp.call(NewPost(post)) # Creates a new post and returns the id!
+    #wp.call(EditPost(post.id, post))# Update the before created post
+
+    
     # need to update the database !!!
     return post.id
 

+ 21 - 14
etc/config_held_de.py

@@ -115,8 +115,15 @@ matthiasKleifegs = "6602072426"
 
 
 # Definition of workgroups for automatic Scopus publication retrieval
-
+# Further parameters define query options: sc_citations and sc_keywords
+# control whether citations or keywords are used. The parameter
+# sc_max_authors defines the author limit used to identify collaboration papers.
+#
 sc_start = 2016
+sc_citations = False
+sc_keywords = True
+sc_max_authors = 25
+
 
 dts_wp11 = [michaelFiederle,dorisEckstein,alexanderDierlamm]
 dts_wp12 = [ulrichTrunk,ivanPeric]
@@ -125,7 +132,7 @@ dts_wp14 = [andreasMussgiller]
 
 dts_wp21 = [marcSchneider,ms2,ms3,ms4]
 dts_wp22 = [peterKaever,matthiasBalzer,oliverSander]
-dts_wp23 = [michaelBussmann,andreasKopmann,surenChilingaryan,matthiasVogelgesang]
+dts_wp23 = [michaelBussmann,andreasKopmann,ak2,surenChilingaryan,matthiasVogelgesang]
 
 dts_wp31 = [davidPennicard]
 dts_wp32 = [berndVoss,oliverSchaefer]
@@ -135,18 +142,18 @@ dts_wp35 = [corneliaWunderer]
 
 
 sc_workgroups = [
-{'name':"Semiconductor sensors",'authors':dts_wp11},
-{'name':"ASICs",'authors':dts_wp12},
-{'name':"Electronics packaging",'authors':dts_wp13},
-{'name':"Innovative materials",'authors':dts_wp14},
-{'name':"Optical data transmission",'authors':dts_wp21},
-{'name':"Programmable electronics",'authors':dts_wp22},
-{'name':"Real-time data processing",'authors':dts_wp23},
-{'name':"Helmholtz cube",'authors':dts_wp31},
-{'name':"Compact gaseous detectors",'authors':dts_wp32},
-{'name':"Photon & X-ray detetors",'authors':dts_wp33},
-{'name':"Fast timing detectors",'authors':dts_wp34},
-{'name':"CMOS sensors",'authors':dts_wp35},
+{'name':"sensors",'authors':dts_wp11},
+{'name':"asics",'authors':dts_wp12},
+{'name':"packaging",'authors':dts_wp13},
+{'name':"materials",'authors':dts_wp14},
+{'name':"photonics",'authors':dts_wp21},
+{'name':"electronics",'authors':dts_wp22},
+{'name':"computing",'authors':dts_wp23},
+{'name':"helmholtz-cube",'authors':dts_wp31},
+{'name':"gaseous-detectors",'authors':dts_wp32},
+{'name':"photon-detectors",'authors':dts_wp33},
+{'name':"fast-timing",'authors':dts_wp34},
+{'name':"cmos-sensors",'authors':dts_wp35},
 ]
 """ Definition of the workgroups
     

+ 7 - 2
etc/config_ufo_kit_edu.py

@@ -32,7 +32,7 @@ log_file = "/root/scopus/log/scopus-publications-ufo-kit-edu.log"
 
 MY_API_KEY = "14d431d052c2caf5e9c4b1ab7de7463d"
 """ Scopus access key (Andreas Kopmann) """
-
+DTS_API_KEY = "f2b35fe46478f22f3c14cf53f73d4f93"
 
 # Scopus author IDs
 
@@ -71,6 +71,7 @@ alexyErshof = "56441809800"
 romanShkarin = "56951331000"
 tiloBaumbach = "7003270957"
 thomasVandekamp = "46761453500"
+danielHaenschke = "55532222200"
 
 # TUD
 michaelHeethoff = "55979397800"
@@ -88,10 +89,14 @@ matthiasKleifegs = "6602072426"
 # Definition of workgroups for automatic Scopus publication retrieval
 
 sc_start = 2010
+sc_citations = False
+sc_keywords = True
+sc_max_authors = 25
+
 
 ufo_pdv = [ak, ak2, csa, matthiasVogelgesang, timoDritschler ]
 ufo_eps = [matthiasBalzer, lorenzoRota, micheleCaselle, mc2 ]
-ufo_ips = [tomyRolo, tr2, tr3, tomasFarago]
+ufo_ips = [tomyRolo, tr2, tr3, tomasFarago, danielHaenschke]
 ufo_apps = [thomasVandekamp]
 ufo_alg = [philipLoesel]
 

+ 13 - 0
log/scopus-publications-held-de.log

@@ -15,3 +15,16 @@
 2018-03-21	18:29:03	  553	    0	 3050	    0	0:00:26.512658
 2018-03-21	18:30:17	  553	    0	 3050	    0	0:00:09.612537
 2018-03-21	18:30:53	  554	    1	 3050	    0	0:00:50.147331
+2018-05-18	09:41:42	  561	    0	     	     	0:00:00.026463
+2018-05-18	09:42:11	  561	   14	     	     	0:02:33.279780
+2018-05-18	09:49:56	  575	    0	     	     	0:00:52.256674
+2018-05-18	10:42:33	   51	    2	     	     	0:00:23.904634
+2018-05-18	11:25:15	   53	  196	     	     	0:22:02.473417
+2018-05-18	14:10:34	  344	  119	     	     	0:07:25.366567
+2018-05-18	14:37:46	  460	    8	     	     	0:01:36.838421
+2018-05-18	15:36:54	  661	  110	     	     	0:07:38.430624
+2018-05-18	17:53:40	  771	  178	     	     	0:19:25.439346
+2018-05-18	20:25:32	  948	    0	     	     	0:03:00.877839
+2018-06-12	23:25:24	  948	   24	     	     	0:09:37.270052
+2018-06-12	23:39:40	  972	    0	     	     	0:01:19.301210
+2018-08-02	13:58:27	  972	   55	     	     	0:07:36.934371

+ 9 - 0
log/scopus-publications-ufo-kit-edu.log

@@ -51,3 +51,12 @@ scopus-get-publications.py
 2018-01-30	16:10:36	  147	    2	  620	   18	0:01:13.152029
 2018-02-08	17:47:46	  147	    0	  624	    4	0:00:17.760916
 2018-03-02	07:40:09	  149	    2	  652	   28	0:01:43.300644
+2018-03-20	11:05:30	  148	    0	  660	    8	0:00:58.301810
+2018-03-20	11:06:32	  148	    0	  660	    0	0:00:10.112241
+2018-03-20	11:50:46	  148	    0	  660	    0	0:00:13.964116
+2018-03-26	10:43:38	  148	    0	  663	    3	0:00:18.869935
+2018-04-16	09:41:47	  149	    1	  667	    4	0:00:33.428451
+2018-04-16	09:52:02	  159	   10	  739	   72	0:03:30.819239
+2018-04-19	19:26:00	  160	    1	  745	    6	0:00:54.086920
+2018-04-23	21:40:42	  160	    0	  745	    0	0:00:10.334309
+2018-04-25	12:36:59	  160	    0	  745	    0	0:00:25.924248

+ 33 - 0
rm-scopusid.py

@@ -0,0 +1,33 @@
+# Remove scopus ID from publication database
+#
+
+import sys
+
+# Mysql persistent data (Account: scopus, $scopus$)
+import pymysql.cursors
+import pymysql
+
+from config import *
+
+
+# Delete every row of the publications table that carries the scopus ID
+# given as first command line argument.
+if len(sys.argv) > 1:
+
+    scopusid = sys.argv[1]
+
+    # Connect to the database
+    connection = pymysql.connect(host=db_host,user=db_user,password=db_pw,db=db_name,charset='utf8mb4',cursorclass=pymysql.cursors.DictCursor)
+
+    try:
+        with connection.cursor() as cursor:
+
+            # The ID comes straight from the command line, so it is handed to
+            # the driver as a query parameter (escaped and quoted there)
+            # instead of being pasted into the SQL text.
+            sql = "DELETE FROM publications WHERE scopusid=%s"
+            print("%s  [scopusid=%s]" % (sql, scopusid))
+            nremoved = cursor.execute(sql, (scopusid,))
+            connection.commit()
+            print("Removed %d record(s)" % nremoved)
+
+    finally:
+        connection.close()
+
+else:
+    # Without an ID there is nothing to delete; say so instead of exiting silently
+    print("Usage: python rm-scopusid.py <scopusid>")
+
+
+
+

+ 25 - 18
scopus_get_publications.py

@@ -129,9 +129,6 @@ def update_publications(authids,authname='',scopus_opts = '',max=0):
 def update_citations():
     """ Read all citations and store in the citation table """
 
-
-    global npubs
-    
     print ""
     print "=== Update citatation of all publication in the database"
     
@@ -147,10 +144,6 @@ def update_citations():
             sql = "SELECT wpid,eid,citedbycount,citesloaded FROM publications WHERE wpid > 0"
             cursor.execute(sql)
             result = cursor.fetchall()
-          
-            print "Total number of publications is %d" % len(result)
-            npubs = len(result)
-            #print "Npubs = %d" % npubs
 
             for pub in result:
                 wpid = int(pub['wpid'])
@@ -208,7 +201,7 @@ def update_citations():
 def update_wp_posts():
     """ Create wordpress posts for all entries that have none """
 
-
+    global npubs
     global nnewpubs
 
     print ""
@@ -221,6 +214,15 @@ def update_wp_posts():
     # Todo: Shift to a separate script !?
     try:
         with connection.cursor() as cursor:
+            # Read all publication records with wpid > 0
+            sql = "SELECT wpid,eid,citedbycount,citesloaded FROM publications WHERE wpid > 0"
+            cursor.execute(sql)
+            result = cursor.fetchall()
+
+            print "Total number of publications is %d" % len(result)
+            npubs = len(result)
+            #print "Npubs = %d" % npubs
+
             # Count all publications
             #sql = "SELECT COUNT(id) FROM publications"
             #cursor.execute(sql)
@@ -367,15 +369,13 @@ if __name__ == "__main__":
 
     # read all citations
     # Todo: read only new citations?!
+    if sc_citations:
+        update_citations()
 
-    update_citations()
-
-
-    # loop over all cites and post comments to wordpress, when necessary
-    # update database
+        # loop over all cites and post comments to wordpress, when necessary
+        # update database
 
-    update_wp_comments()
-    # Todo: deactivate comments for scopus posts!!!
+        update_wp_comments()
 
 
     # Display summary
@@ -385,8 +385,9 @@ if __name__ == "__main__":
     print "Date       = " + str(start)
     print "NPubs      = " + str(npubs)
     print "NNewPubs   = " + str(nnewpubs)
-    print "NCites     = " + str(ncites)
-    print "NNewCites  = " + str(nnewcites)
+    if sc_citations:
+        print "NCites     = " + str(ncites)
+        print "NNewCites  = " + str(nnewcites)
     print "Runtime    = " + str(end - start)
 
 
@@ -403,9 +404,15 @@ if __name__ == "__main__":
 
 
     log = open(log_file,"a")
-    log.write("%s\t%s\t%5d\t%5d\t%5d\t%5d\t%s\n" % (start.strftime("%Y-%m-%d"),
+    if sc_citations:
+        log.write("%s\t%s\t%5d\t%5d\t%5d\t%5d\t%s\n" % (start.strftime("%Y-%m-%d"),
                                         start.strftime("%H:%M:%S"),
                                         npubs,nnewpubs,ncites,nnewcites,str(end-start)))
+    else:
+        log.write("%s\t%s\t%5d\t%5d\t%5s\t%5s\t%s\n" % (start.strftime("%Y-%m-%d"),
+                                        start.strftime("%H:%M:%S"),
+                                        npubs,nnewpubs,"","",str(end-start)))
+
     log.close()
 
     # done

+ 31 - 0
test-citations.py

@@ -0,0 +1,31 @@
+import requests
+import json
+from pprint import pprint
+
+from config import *
+
+#
+# Script to check all the affiliations of the authors
+#
+
+# Fetch one Scopus abstract record and dump the raw JSON response.
+SCOPUS_ID = "SCOPUS_ID:85039766090"
+EID = "2-s2.0-84946782439"
+
+# Alternative endpoints, kept for manual switching while testing
+#url = "https://api.elsevier.com/content/search/scopus?query=refeid(" + EID + ")"
+#url = "https://api.elsevier.com/content/abstract/citations/scopus_id/" + SCOPUS_ID
+# Use https like the endpoints above: the API key travels in a request header
+url = "https://api.elsevier.com/content/abstract/scopus_id/" + SCOPUS_ID
+
+
+# The timeout keeps the script from hanging forever on a stalled connection
+resp = requests.get(url,
+                    headers={'Accept':'application/json','X-ELS-APIKey':DTS_API_KEY},
+                    timeout=30)
+
+results = resp.json()
+pprint(results)
+
+
+
+
+
+
+
+
+

+ 62 - 0
test-citations2.py

@@ -0,0 +1,62 @@
+import requests
+import json
+from pprint import pprint
+
+from config import *
+
+#
+# Script to check all the affiliations of the authors
+#
+
+# EID that is inserted into the refeid() search query below
+EID = "2-s2.0-84946782439" 
+
+url = "https://api.elsevier.com/content/search/scopus?query=refeid(" + EID + ")" 
+
+resp = requests.get(url,headers={'Accept':'application/json','X-ELS-APIKey':MY_API_KEY})
+
+# Dump the raw JSON answer of the single, un-paged request
+results = resp.json()
+pprint(results)
+
+# NOTE(review): exit() ends the script here, so nothing below this line is
+# ever executed -- presumably a temporary debugging shortcut; confirm before
+# removing either the exit() or the loop.
+exit()
+
+
+
+# Paging state: `count` results are requested per call, beginning at offset `start`
+count = 25
+n = 0
+npubstoget = 25
+start = 0
+# NOTE(review): ntotal is assigned but not used anywhere in the visible code
+ntotal = 0
+publist = []
+
+# Request page after page until the remaining-results counter is used up
+while (npubstoget > 0):
+
+        loopargs = "&count=%d&start=%d" % (count, start)
+        #print loopargs
+
+        url = ("https://api.elsevier.com/content/search/scopus?query=refeid("
+               + EID + ")" + loopargs)
+
+        print "URL: " + url
+        resp = requests.get(url,headers={'Accept':'application/json','X-ELS-APIKey':MY_API_KEY})
+
+        results = resp.json()
+        pprint(results)
+        #print json.dumps(results,sort_keys=True,indent=4, separators=(',', ': '))
+
+        # While n is still 0 (first pass), take the total result count from
+        # the response and use it as the number of entries left to fetch
+        if (n==0):
+            n = int(results['search-results']['opensearch:totalResults'])
+            #print "Current number citations in scopus = %d" % n
+            npubstoget = n
+
+        # Collect the entries of this page only when the query reported results
+        if (n>0):
+                publist += results['search-results']['entry']
+
+        # Advance to the next page; the loop ends once npubstoget drops to <= 0
+        npubstoget = npubstoget - count
+        start += count
+
+
+
+
+
+
+

+ 8 - 8
test-scopus.py

@@ -191,11 +191,11 @@ def get_scopus_brief(SCOPUS_ID, max_authors=1000):
 # List of newly cited publications
 #
 
+print get_scopus_info("SCOPUS_ID:84969498463")
 
-
-resp = requests.get("http://api.elsevier.com/content/author?author_id="+ak+"&view=metrics",
-            headers={'Accept':'application/json',
-                             'X-ELS-APIKey': MY_API_KEY})
+#resp = requests.get("http://api.elsevier.com/content/author?author_id="+ak+"&view=metrics",
+#            headers={'Accept':'application/json',
+#                             'X-ELS-APIKey': MY_API_KEY})
 
 #print resp
 
@@ -215,7 +215,7 @@ resp = requests.get("http://api.elsevier.com/content/author?author_id="+ak+"&vie
 #publist = get_scopus_list(ufo_ips, 'PUBYEAR = 2015', 30)
 
 # Exclude authors?
-publist = get_scopus_list(pdv, 'NOT AU-ID(7006284555)', 10)
+#publist = get_scopus_list(pdv, 'NOT AU-ID(7006284555)', 10)
 
 
 # Author ausschliessen - black list !!!
@@ -224,10 +224,10 @@ publist = get_scopus_list(pdv, 'NOT AU-ID(7006284555)', 10)
 #
 # Display the result
 #
-print "Number of publications: %d" % len(publist)
+#print "Number of publications: %d" % len(publist)
 
-for pub in publist:
-    print get_scopus_brief(pub,10000)
+#for pub in publist:
+#    print get_scopus_brief(pub,10000)
 
 
 # Test printing functions

+ 33 - 7
test-wp.py

@@ -9,9 +9,20 @@ from wordpress_xmlrpc import WordPressPost, WordPressComment
 from wordpress_xmlrpc.methods.posts import GetPosts, NewPost, EditPost
 from wordpress_xmlrpc.methods.comments import NewComment, EditComment
 from wordpress_xmlrpc.methods.users import GetUserInfo
+from wordpress_xmlrpc.methods.taxonomies import GetTerms
 
 from config import *
 
+def wordpress_get_category(slug):
+    """Return the WordPress category term whose slug equals `slug`.
+
+    The terms of the 'category' taxonomy are fetched through the
+    module-level client `wp`; the first term whose `slug` attribute
+    matches is returned. Returns None when no term matches.
+    """
+    terms = wp.call(GetTerms('category'))
+    matches = (term for term in terms if term.slug == slug)
+    return next(matches, None)
+
+
+
 # Use Wordpress account - not the mysql credentials
 # Todo: use scopus later !!!
 wp = Client(wp_api_url,wp_user,wp_password)
@@ -19,23 +30,38 @@ wp = Client(wp_api_url,wp_user,wp_password)
 
 #print wp.call(GetUserInfo())
 
+# Test access to categories
+categories = []
+cat = wordpress_get_category("sensors")
+
+if cat: 
+    print "Found ", cat.name
+    categories.append(cat)
+
+
 # Todo: Set the date of the post according to the scopus date
 
 post = WordPressPost()
 post.title = 'My post 7' # put title of the publication here
 post.slug = 'DOIxxxxx7' # set the name of the post different to the title
 post.content = 'This is a more complete example post about XML-RPC (but still not comlete enough)'
-post.id = wp.call(NewPost(post)) # Creates a new post and returns the id!
+#post.id = wp.call(NewPost(post)) # Creates a new post and returns the id!
+
+post.terms = categories
 
 post.terms_names = {
-    # 'post_tag': ['test', 'firstpost'], # what's that? I don't use it currently
-    'category': ['Publications', 'Reports'] # defined in WP + python script
+    #'post_tag': ['test', 'firstpost'], # what's that? I don't use it currently
+    'category': ['Publications', 'asics'] # defined in WP + python script
 }
 
 # whoops, I forgot to publish it!
 post.post_status = 'publish' # alternative is draft here !
 post.comment_status = 'open' # allow comments - may be only for scopus
-wp.call(EditPost(post.id, post))# Update the before created post
+post.id = wp.call(NewPost(post)) # Creates a new post and returns the id!
+
+#wp.call(EditPost(post.id, post))# Update the before created post
+
+print "Created Wordpress post ", post.id
 
 # Todo:
 # Save the id in the publication table, together with the with the scopus id
@@ -50,10 +76,10 @@ wp.call(EditPost(post.id, post))# Update the before created post
 # or my biotech items???
 #
 
-comment = WordPressComment()
-comment.content = 'Hi, thats cool - we can also add our comments automatically'
+#comment = WordPressComment()
+#comment.content = 'Hi, thats cool - we can also add our comments automatically'
 
-comment.id = wp.call(NewComment(post.id, comment))
+#comment.id = wp.call(NewComment(post.id, comment))
 
 
 

+ 2 - 2
test-wp2.py

@@ -30,10 +30,10 @@ if len(sys.argv) > 1:
 # Read post
 try:
     post = wp.call(GetPost(wpid))
-    print "Post %d: %s" %(wpid,post.title)
+    print ("Post %d: %s" %(wpid,post.title))
 
 except:
-    print "Post %d seems to be not available" % wpid
+    print ("Post %d seems to be not available" % wpid)
 
 
 

+ 2 - 1
update.sh

@@ -1,6 +1,7 @@
+#!/bin/bash
 # Update publications on UFO webpage by new data in Scopus
 # A Kopmann, 11.4.2017
 #
 
-python -W ignore /root/scopus-held/scopus_get_publications.py
+# Resolve the Python script relative to this file, not the caller's working
+# directory, so the update also works when started from cron or another path
+python -W ignore "$(dirname "$0")/scopus_get_publications.py"