Browse Source

Added script to remove unused publications from the database

Andreas Kopmann 7 years ago
parent
commit
b40eb84e14
8 changed files with 516 additions and 2 deletions
  1. 11 0
      README.txt
  2. 18 0
      ak_wordpress.py
  3. 4 2
      scopus-get-publications.py
  4. 107 0
      scopus-update-database.py
  5. 265 0
      test-scopus.py
  6. 16 0
      test-scopus2.py
  7. 57 0
      test-wp.py
  8. 38 0
      test-wp2.py

+ 11 - 0
README.txt

@@ -6,6 +6,15 @@ Get information from Scopus database.
 These queries work only with access to Scopus (e.g. from KIT LAN)
 The Scopus service is not publicly available.
 
+Version history
+
+Version 1.0, 8.3.17 (ak):
+- initial version of a single script without any options
+	It runs in 4 phases: get publications for individual author groups,
+	create posts, get all citations, create comments. 
+- used with the test installation at the UFO server in March 2017
+ 
+
 
 Content
 info		Documentation, website, etc
@@ -19,6 +28,8 @@ scopus-get-publications.py Script to query Scopus
 test-scopus.py	Application with some functions to get publication entries
 		Prints a list with some formatting
 test-scopus2.py Example from one of the website, only one query
+test-wp.py	Test script for access to the wordpress API	
+test-wp2.py 	Test script for wordpress - only query, no modification
 
 
 Usage: 

+ 18 - 0
ak_wordpress.py

@@ -15,6 +15,24 @@ from wordpress_xmlrpc.methods.comments import NewComment, EditComment
 wp = Client('http://localhost/~kopmann/ufo2/xmlrpc.php', 'scopus', '$scopus$')
 
 
+#
+# query post
+#
+
+def wordpress_get_post(wpid):
+    
+    try:
+        post = wp.call(GetPost(wpid))
+        #print post.title
+    
+        ret = 1
+    except:
+        #print "Post seems to be not available"
+        ret = 0
+
+    return ret
+
+
 #
 # create a post from a scopus query
 #

+ 4 - 2
scopus-get-publications.py

@@ -25,7 +25,7 @@ from my_scopus import ak, csa, pdv, ufo, ufo_pdv, ufo_ips, ufo_eps, ufo_apps
 from ak_scopus import get_scopus_list, get_scopus_data, get_scopus_refs
 
 
-from ak_wordpress import wordpress_post_by_scopus, wordpress_comment_by_scopus
+from ak_wordpress import wordpress_post_by_scopus, wordpress_comment_by_scopus, wordpress_get_post
 
 
 # Mysql persistent data (Accout: scopus, $scopus$)
@@ -48,6 +48,7 @@ nnewcites = 0
 
 
 
+# Read publications of a list of authors and store in the database
 def update_publications(authids,authname='',scopus_opts = '',max=0):
 
 
@@ -127,7 +128,7 @@ def update_citations():
     try:
         with connection.cursor() as cursor:
             # Read a single record
-            sql = "SELECT wpid,eid,citedbycount,citesloaded FROM publications"
+            sql = "SELECT wpid,eid,citedbycount,citesloaded FROM publications WHERE wpid > 0"
             cursor.execute(sql)
             result = cursor.fetchall()
           
@@ -346,6 +347,7 @@ print ""
 # Todo: Detect, if there is no access to scopus !!!
 #
 
+
 # Define the author, that should be considered
 #authors = ["Computing", ufo_pdv]
 #print authors

+ 107 - 0
scopus-update-database.py

@@ -0,0 +1,107 @@
+# Get new publications
+# Publication and citations retrieval
+# A. Kopmann, 12.4.17 (ak)
+#
+# Scope:
+# Synchronize publication and citation database with
+# the actual state of posts in wordpress.
+#
+# Post might be deleted, if not fitting to the
+# scope of the website - mark these posts by wpid=0
+# in publication database and drop all citations
+#
+
+import datetime
+import requests
+import json
+import os.path
+
+from ak_wordpress import wordpress_post_by_scopus, wordpress_comment_by_scopus, wordpress_get_post
+
+# MySQL persistent data (Account: scopus, $scopus$)
+import pymysql.cursors
+import pymysql
+
+db_host = 'localhost'
+db_user = 'scopus'
+db_pw = '$scopus$'
+db_name = 'scopus'
+
+
+# Summary
+npubs = 0
+
+
+# Check if all WP post stored in the database are still existing
+# Todo:
+# - Download full article description
+# - Update categories for author groups in wordpress
+#
+def update_database():
+    
+    print "=== Check validity of publications database "
+    
+    # Connect to the database
+    connection = pymysql.connect(host=db_host,user=db_user,password=db_pw,db=db_name,charset='utf8mb4',cursorclass=pymysql.cursors.DictCursor)
+    
+    # Save all publication to the publication database
+    try:
+        with connection.cursor() as cursor:
+            
+            sql = "SELECT wpid FROM `publications`"
+            cursor.execute(sql)
+            
+            result = cursor.fetchall()
+            
+            for pub in result:
+                id = int(pub['wpid'])
+                
+                
+                if (id > 0):
+                    if not wordpress_get_post(id):
+                        print "Marking post %d as ununsed; deleting citations" % id
+                        
+                        sql = "DELETE FROM `citations` WHERE `wpid` = %s"
+                        cursor.execute(sql, (id) )
+                        
+                        sql = "UPDATE `publications` SET `wpid` = 0 WHERE `wpid` = %s"
+                        cursor.execute(sql, (id) )
+                        npubs += 1
+    
+    
+            # connection is not autocommit by default. So you must commit to save
+            # your changes.
+            connection.commit()
+
+    finally:
+        connection.close()
+
+
+
+
+
+# Main
+
+start = datetime.datetime.now()
+
+print ""
+print "***********************************************"
+print "**** scopus-update-database / " + start.strftime("%Y-%m-%d") + " *****"
+print "***********************************************"
+print ""
+
+
+# Update database
+update_database()
+
+
+# Display summary
+end = datetime.datetime.now()
+print ""
+print "Summary: "
+print "Date          = " + str(start)
+print "NPubs removed = " + str(npubs)
+print "Runtime       = " + str(end - start)
+print
+
+

+ 265 - 0
test-scopus.py

@@ -0,0 +1,265 @@
+# Access Scopus database
+#
+
+import requests
+import json
+from my_scopus import MY_API_KEY
+from my_scopus import ak, pdv, ufo, ufo_ips
+
+
+#
+# Get the last N publications of a given author list
+#
+# Arguments: list of scopus author ids (e.g. "35313939900")
+# Returns: list of scopus article ids ('SCOPUS_ID:0037368024')
+#   The result can be used in the display functions like get_scopus_brief()
+#
+def get_scopus_list(author_list, opt='', n=5):
+    
+    if isinstance(author_list, list):
+        #print "Length of author list %d" % len(author_list)
+        query = ' OR '.join(['AU-ID('+au+')' for au in author_list])
+    else:
+        query = 'AU-ID('+author_list+')'
+
+    if len(opt) > 0:
+        query = query + " AND " + opt
+    
+    #print "Query: " + query
+    url = ("http://api.elsevier.com/content/search/scopus?query="
+           +query+ "&field=dc:identifier&count=" + str(n))
+    #print "URL: " + url
+    resp = requests.get(url,
+                headers={'Accept':'application/json',
+                        'X-ELS-APIKey': MY_API_KEY})
+    #print resp
+    results = resp.json()
+    return [[str(r['dc:identifier'])] for r in results['search-results']["entry"]]
+
+
+
+
+def get_scopus_info(SCOPUS_ID):
+    """Fetch one abstract record from Scopus and return it as formatted text.
+
+    SCOPUS_ID is appended to the abstract retrieval URL as given. The decoded
+    JSON response is printed, then a string with authors, title, journal,
+    volume, page range (or article number), cover date, DOI, citation count
+    and abstract is returned.
+
+    Fields read with [] raise KeyError when missing from the response; if
+    neither 'prism:pageRange' nor 'article-number' is present, .encode() is
+    called on None and raises AttributeError.
+    """
+    url = ("http://api.elsevier.com/content/abstract/scopus_id/"
+           + SCOPUS_ID
+           + "?field=article-number,title,publicationName,volume,issueIdentifier,"
+           + "prism:pageRange,coverDate,article-number,doi,citedby-count,prism:aggregationType,url,identifier,description,authors,prism:issn")
+    #print url
+    resp = requests.get(url,
+                headers={'Accept':'application/json',
+                        'X-ELS-APIKey': MY_API_KEY})
+    results = json.loads(resp.text.encode('utf-8'))
+    #print resp
+    # Dump of the complete decoded response
+    print results
+    
+    fstring = '{authors}, {title}, {journal}, {volume}, {articlenum}, ({date}). {doi} (cited {cites} times).\n{abstract}\n\n'
+
+    # NOTE(review): the author names are passed without .encode() while all
+    # other fields are UTF-8 encoded -- presumably non-ASCII author names
+    # fail here under Python 2; confirm.
+    return fstring.format(authors=', '.join([au['ce:indexed-name'] for au in results['abstracts-retrieval-response']['authors']['author']]),
+                                title=results['abstracts-retrieval-response']['coredata']['dc:title'].encode('utf-8'),
+                                 journal=results['abstracts-retrieval-response']['coredata']['prism:publicationName'].encode('utf-8'),
+                                 volume=results['abstracts-retrieval-response']['coredata']['prism:volume'].encode('utf-8'),
+                                 articlenum=(results['abstracts-retrieval-response']['coredata'].get('prism:pageRange') or
+                                             results['abstracts-retrieval-response']['coredata'].get('article-number')).encode('utf-8'),
+                                 date=results['abstracts-retrieval-response']['coredata']['prism:coverDate'].encode('utf-8'),
+                                 doi='doi:' + results['abstracts-retrieval-response']['coredata']['prism:doi'].encode('utf-8'),
+                                 cites=int(results['abstracts-retrieval-response']['coredata']['citedby-count'].encode('utf-8')),
+                                 abstract=results['abstracts-retrieval-response']['coredata']['dc:description'].encode('utf-8'))
+
+
+#
+# Display a list of publications in plain text format
+#
+# Argument: scopus id of the publication
+#
+# Todo: Implement other formats (e.g. html, bibtex)
+#   Format publications as articles, Title, Abstract
+#
+def get_scopus_brief(SCOPUS_ID, max_authors=1000):
+    """Return a one-line plain text citation for a Scopus publication.
+
+    SCOPUS_ID   -- the publication id as a string, or a list whose first
+                   element is the id (the entry format of get_scopus_list)
+    max_authors -- publications with more authors than this return ''
+
+    The line consists of the authors (only the first one plus ' et.al.' when
+    there are more than 20), followed by title, journal, volume, issue,
+    cover date, page range or article number, DOI and citation count -- each
+    only if present in the response -- and ends with '.\n'.
+    """
+    id = SCOPUS_ID
+    # Accept the one-element lists as well as a plain id string
+    if isinstance(id, list):
+        id = id[0]
+        
+    url = ("http://api.elsevier.com/content/abstract/scopus_id/"
+           + id
+           + "?field=authors,article-number,title,publicationName,volume,issueIdentifier,"
+           + "prism:pageRange,coverDate,article-number,doi,citedby-count,prism:aggregationType,url,identifier,description,prism:issn")
+
+    #print url
+    resp = requests.get(url,
+                headers={'Accept':'application/json',
+                            'X-ELS-APIKey': MY_API_KEY})
+    results = json.loads(resp.text.encode('utf-8'))
+    #print resp
+    #print results
+    
+    coredata = results['abstracts-retrieval-response']['coredata']
+    
+    pub = ''
+    authors = results['abstracts-retrieval-response']['authors']['author']
+    #print "Number of authors: %d" %len(authors)
+
+    # Filter out publications with more authors than the caller allows
+    if len(authors) > max_authors:
+        return ''
+
+    # Long author lists are abbreviated to the first author
+    if len(authors) > 20:
+        pub = pub + authors[0]['ce:indexed-name'] + ' et.al.: '
+    else:
+        pub = ', '.join([au['ce:indexed-name'] for au in authors]) + ': '
+
+    # NOTE(review): the author part is not encoded while the fields below
+    # are appended as UTF-8 byte strings -- presumably this mix is what
+    # triggers the error handled here under Python 2; confirm.
+    try:
+        if coredata.get('dc:title'):
+            pub = pub + coredata.get('dc:title').encode('utf-8')
+    except ValueError:
+        print "!!! Error encoding title of publication !!!"
+        #print coredata.get('dc:title')
+        # Fallback: append the title without encoding it
+        pub = pub + coredata.get('dc:title')
+
+    # All following fields are optional and only appended when present
+    if coredata.get('prism:publicationName'):
+        pub = pub + ', ' + coredata.get('prism:publicationName').encode('utf-8')
+
+    if coredata.get('prism:volume'):
+        pub = pub + ', ' + coredata.get('prism:volume').encode('utf-8')
+
+    if coredata.get('prism:issueIdentifier'):
+        pub = pub + ', ' + coredata.get('prism:issueIdentifier').encode('utf-8')
+    
+    if coredata.get('prism:coverDate'):
+        pub = pub + ' (' + coredata.get('prism:coverDate').encode('utf-8') + ') '
+
+    # The page range is preferred; the article number is used only without it
+    if coredata.get('prism:pageRange'):
+        pub = pub + coredata.get('prism:pageRange').encode('utf-8')
+    elif coredata.get('article-number'):
+        pub = pub + coredata.get('article-number').encode('utf-8')
+
+    if coredata.get('prism:doi'):
+        pub = pub + ', doi:' + coredata.get('prism:doi').encode('utf-8')
+
+    if coredata.get('citedby-count'):
+        pub = pub + ' (cited ' + coredata.get('citedby-count').encode('utf-8') + ' times)'
+
+    pub = pub + '.\n'
+
+
+    return pub
+
+
+
+# What kind of lists are interesting for a group website?
+
+
+# Organisation:
+#
+# Latest publications               |  Contact person
+#   List of 3-5 with titles first
+#   First lines of the abtract      | Events:
+#   Number of citations             | Upcoming event
+#   (Blog format)
+#
+# Lately cited papers               | Featured publications
+#   List of 3-5 papers updated cites|
+#   in the last N month
+#   Format: Blog
+#
+#
+
+# List of the latest N publications (N = 3-5)
+# List of the latest publications without big collaboration papers?
+# List of collaboration papers
+# List of publication of the last N month (N = 6-12)
+# List split by sub groups
+
+# List of all publication of one year, that can be selected
+# List of featured publications (selected by a list of Scopus-Ids / or Blog with Scopus-Id)
+
+
+# Problem: If the people are in too many different projects
+# How to list only this ones of a certain topic?
+# Is there a subgroup, that is only involved in one project?
+# Try: Leave out ak, csa, we, baumbach, etc?
+
+
+#print "LUMINEU 0\u03bd2\u03b2 project".encode('utf-8')
+
+
+# get number of publications?
+# order by date?
+# limit to certain interval
+
+
+#
+# Save publications with their number of citations in JSON file of SQLITE database?!
+# List of newly cited publications
+#
+
+
+
+resp = requests.get("http://api.elsevier.com/content/author?author_id="+ak+"&view=metrics",
+            headers={'Accept':'application/json',
+                             'X-ELS-APIKey': MY_API_KEY})
+
+#print resp
+
+#print json.dumps(resp.json(),
+#                 sort_keys=True,
+#                 indent=4, separators=(',', ': '))
+
+
+#print get_scopus_info('SCOPUS_ID:0037368024')
+
+#publist = get_scopus_list(ak)
+#publist = get_scopus_list([ak,ak2], 'PUBYEAR = 2014', 30)
+
+#publist = get_scopus_list(pdv, 'PUBYEAR = 2015', 30)
+
+#publist = get_scopus_list(ufo_ipe, 'PUBYEAR = 2015', 30)
+#publist = get_scopus_list(ufo_ips, 'PUBYEAR = 2015', 30)
+
+# Exclude authors?
+publist = get_scopus_list(pdv, 'NOT AU-ID(7006284555)', 10)
+
+
+# Exclude authors - black list !!!
+
+
+#
+# Display the result
+#
+print "Number of publications: %d" % len(publist)
+
+for pub in publist:
+    print get_scopus_brief(pub,10000)
+
+
+# Test printing functions
+
+#print publist[2][0]
+#print get_scopus_info('SCOPUS_ID:0037368024')
+#print get_scopus_info(publist[2][0])
+
+
+
+
+# Merge publications
+# Search for all publications? Eliminate duplicates
+# Format abstract
+#
+# There seem to be also preformatted output?
+# Is output in BibTeX possible?
+#
+# Impact-Factor of the journal?
+
+
+
+# Queries:
+
+# List all publications
+# List the lastest publications
+# List list publications of the group wo double entries
+# List latest pubs
+# List publication with higest citation count
+# List publication in magazines with highest impact value
+# Print collaborator network of institutions
+# Print list of collaborations wo large collabs
+
+#
+

+ 16 - 0
test-scopus2.py

@@ -0,0 +1,16 @@
+import requests
+import json
+from my_scopus import MY_API_KEY
+
+resp = requests.get("http://api.elsevier.com/content/search/scopus?query=AU-ID(7004212771)&field=dc:identifier&count=10",
+                    headers={'Accept':'application/json',
+                    'X-ELS-APIKey': MY_API_KEY})
+
+results = resp.json()
+
+print results
+
+#for r in results['search-results']["entry"]
+#    print r
+
+#return [[str(r['dc:identifier'])] for r in results['search-results']["entry"]]

+ 57 - 0
test-wp.py

@@ -0,0 +1,57 @@
+# Test access to Wordpress
+#
+
+# Comment: Quite a useful API to attach external scripts!!!
+#
+
+from wordpress_xmlrpc import Client
+from wordpress_xmlrpc import WordPressPost, WordPressComment
+from wordpress_xmlrpc.methods.posts import GetPosts, NewPost, EditPost
+from wordpress_xmlrpc.methods.comments import NewComment, EditComment
+from wordpress_xmlrpc.methods.users import GetUserInfo
+
+# Use Wordpress account - not the mysql credentials
+# Todo: use scopus later !!!
+wp = Client('http://localhost/~kopmann/ufo2/xmlrpc.php', 'ufo', '$ipepdv$')
+#print wp.call(GetPosts())
+
+#print wp.call(GetUserInfo())
+
+# Todo: Set the date of the post according to the scopus date
+
+post = WordPressPost()
+post.title = 'My post 7' # put title of the publication here
+post.slug = 'DOIxxxxx7' # set the name of the post different to the title
+post.content = 'This is a more complete example post about XML-RPC (but still not comlete enough)'
+post.id = wp.call(NewPost(post)) # Creates a new post and returns the id!
+
+post.terms_names = {
+    # 'post_tag': ['test', 'firstpost'], # what's that? I don't use it currently
+    'category': ['Publications', 'Reports'] # defined in WP + python script
+}
+
+# whoops, I forgot to publish it!
+post.post_status = 'publish' # alternative is draft here !
+post.comment_status = 'open' # allow comments - may be only for scopus
+wp.call(EditPost(post.id, post))# Update the before created post
+
+# Todo:
+# Save the id in the publication table, together with the with the scopus id
+# and bibtext file
+# Create a table with the defined authors and their publications?! (later)
+
+
+# Todo
+# - try to add a comment to a post !!!
+# - how to identify that a post has been deleted, because the publication
+# does not fit to the scope of the site? E.g. the Auger publications?
+# or my biotech items???
+#
+
+comment = WordPressComment()
+comment.content = 'Hi, thats cool - we can also add our comments automatically'
+
+comment.id = wp.call(NewComment(post.id, comment))
+
+
+

+ 38 - 0
test-wp2.py

@@ -0,0 +1,38 @@
+# Test access to Wordpress
+#
+
+# Comment: Quite a useful API to attach external scripts!!!
+#
+
+import sys
+
+from wordpress_xmlrpc import Client
+from wordpress_xmlrpc import WordPressPost, WordPressComment
+from wordpress_xmlrpc.methods.posts import GetPost
+
+
+# Access WP post
+# How to detect if it is available???
+
+
+# Use Wordpress account - not the mysql credentials
+# Todo: use scopus later !!!
+wp = Client('http://localhost/~kopmann/ufo2/xmlrpc.php', 'ufo', '$ipepdv$')
+
+
+wpid = 1149
+if len(sys.argv) > 1:
+    wpid = int(sys.argv[1])
+
+
+# Read post
+try:
+    post = wp.call(GetPost(wpid))
+    print "Post %d: %s" %(wpid,post.title)
+
+except:
+    print "Post %d seems to be not available" % wpid
+
+
+
+