Browse Source

Added script to remove unused publications from the database

Andreas Kopmann 7 years ago
parent
commit
b40eb84e14
8 changed files with 516 additions and 2 deletions
  1. 11 0
      README.txt
  2. 18 0
      ak_wordpress.py
  3. 4 2
      scopus-get-publications.py
  4. 107 0
      scopus-update-database.py
  5. 265 0
      test-scopus.py
  6. 16 0
      test-scopus2.py
  7. 57 0
      test-wp.py
  8. 38 0
      test-wp2.py

+ 11 - 0
README.txt

@@ -6,6 +6,15 @@ Get information from Scopus database.
 These queries work only with access to Scopus (e.g. from KIT LAN)
 These queries work only with access to Scopus (e.g. from KIT LAN)
 The Scopus service is not publicly available.
 The Scopus service is not publicly available.
 
 
+Version history
+
+Version 1.0, 8.3.17 (ak):
+- initial version of a single script without any options
+	It runs in 4 phases: get publications for individual author groups,
+	create posts, get all citations, create comments. 
+- used with the test installation at the UFO server in March 2017
+ 
+
 
 
 Content
 Content
 info		Documentation, website, etc
 info		Documentation, website, etc
@@ -19,6 +28,8 @@ scopus-get-publications.py Script to query Scopus
 test-scopus.py	Application with some functions to get publication entries
 test-scopus.py	Application with some functions to get publication entries
 		Prints a list with some formatting
 		Prints a list with some formatting
 test-scopus2.py Example from one of the website, only one query
 test-scopus2.py Example from one of the website, only one query
+test-wp.py	Test script for access to the wordpress API	
+test-wp2.py 	Test script for wordpress - only query, no modification
 
 
 
 
 Usage: 
 Usage: 

+ 18 - 0
ak_wordpress.py

@@ -15,6 +15,24 @@ from wordpress_xmlrpc.methods.comments import NewComment, EditComment
 wp = Client('http://localhost/~kopmann/ufo2/xmlrpc.php', 'scopus', '$scopus$')
 wp = Client('http://localhost/~kopmann/ufo2/xmlrpc.php', 'scopus', '$scopus$')
 
 
 
 
+#
+# query post
+#
+
+def wordpress_get_post(wpid):
+    
+    try:
+        post = wp.call(GetPost(wpid))
+        #print post.title
+    
+        ret = 1
+    except:
+        #print "Post seems to be not available"
+        ret = 0
+
+    return ret
+
+
 #
 #
 # create a post from a scopus query
 # create a post from a scopus query
 #
 #

+ 4 - 2
scopus-get-publications.py

@@ -25,7 +25,7 @@ from my_scopus import ak, csa, pdv, ufo, ufo_pdv, ufo_ips, ufo_eps, ufo_apps
 from ak_scopus import get_scopus_list, get_scopus_data, get_scopus_refs
 from ak_scopus import get_scopus_list, get_scopus_data, get_scopus_refs
 
 
 
 
-from ak_wordpress import wordpress_post_by_scopus, wordpress_comment_by_scopus
+from ak_wordpress import wordpress_post_by_scopus, wordpress_comment_by_scopus, wordpress_get_post
 
 
 
 
 # Mysql persistent data (Account: scopus, $scopus$)
 # Mysql persistent data (Account: scopus, $scopus$)
@@ -48,6 +48,7 @@ nnewcites = 0
 
 
 
 
 
 
+# Read publications of a list of authors and store in the database
 def update_publications(authids,authname='',scopus_opts = '',max=0):
 def update_publications(authids,authname='',scopus_opts = '',max=0):
 
 
 
 
@@ -127,7 +128,7 @@ def update_citations():
     try:
     try:
         with connection.cursor() as cursor:
         with connection.cursor() as cursor:
             # Read a single record
             # Read a single record
-            sql = "SELECT wpid,eid,citedbycount,citesloaded FROM publications"
+            sql = "SELECT wpid,eid,citedbycount,citesloaded FROM publications WHERE wpid > 0"
             cursor.execute(sql)
             cursor.execute(sql)
             result = cursor.fetchall()
             result = cursor.fetchall()
           
           
@@ -346,6 +347,7 @@ print ""
 # Todo: Detect, if there is no access to scopus !!!
 # Todo: Detect, if there is no access to scopus !!!
 #
 #
 
 
+
 # Define the author, that should be considered
 # Define the author, that should be considered
 #authors = ["Computing", ufo_pdv]
 #authors = ["Computing", ufo_pdv]
 #print authors
 #print authors

+ 107 - 0
scopus-update-database.py

@@ -0,0 +1,107 @@
+# Get new publications
+# Publication and citations retrieval
+# A. Kopmann, 12.4.17 (ak)
+#
+# Scope:
+# Synchronize publication and citation database with
+# the actual state of posts in wordpress.
+#
+# Post might be deleted, if not fitting to the
+# scope of the website - mark these posts by wpid=0
+# in publication database and drop all citations
+#
+
+import datetime
+import requests
+import json
+import os.path
+
+from ak_wordpress import wordpress_post_by_scopus, wordpress_comment_by_scopus, wordpress_get_post
+
+# Mysql persistent data (Account: scopus, $scopus$)
+import pymysql.cursors
+import pymysql
+
+db_host = 'localhost'
+db_user = 'scopus'
+db_pw = '$scopus$'
+db_name = 'scopus'
+
+
+# Summary
+npubs = 0
+
+
+# Check if all WP post stored in the database are still existing
+# Todo:
+# - Download full article description
+# - Update categories for author groups in wordpress
+#
+def update_database():
+    
+    print "=== Check validity of publications database "
+    
+    # Connect to the database
+    connection = pymysql.connect(host=db_host,user=db_user,password=db_pw,db=db_name,charset='utf8mb4',cursorclass=pymysql.cursors.DictCursor)
+    
+    # Save all publication to the publication database
+    try:
+        with connection.cursor() as cursor:
+            
+            sql = "SELECT wpid FROM `publications`"
+            cursor.execute(sql)
+            
+            result = cursor.fetchall()
+            
+            for pub in result:
+                id = int(pub['wpid'])
+                
+                
+                if (id > 0):
+                    if not wordpress_get_post(id):
+                        print "Marking post %d as ununsed; deleting citations" % id
+                        
+                        sql = "DELETE FROM `citations` WHERE `wpid` = %s"
+                        cursor.execute(sql, (id) )
+                        
+                        sql = "UPDATE `publications` SET `wpid` = 0 WHERE `wpid` = %s"
+                        cursor.execute(sql, (id) )
+                        npubs += 1
+    
+    
+            # connection is not autocommit by default. So you must commit to save
+            # your changes.
+            connection.commit()
+
+    finally:
+        connection.close()
+
+
+
+
+
+# Main
+
+start = datetime.datetime.now()
+
+print ""
+print "***********************************************"
+print "**** scopus-update-database / " + start.strftime("%Y-%m-%d") + " *****"
+print "***********************************************"
+print ""
+
+
+# Update database
+update_database()
+
+
+# Display summary
+end = datetime.datetime.now()
+print ""
+print "Summary: "
+print "Date          = " + str(start)
+print "NPubs removed = " + str(npubs)
+print "Runtime       = " + str(end - start)
+print
+
+

+ 265 - 0
test-scopus.py

@@ -0,0 +1,265 @@
+# Access Scopus database
+#
+
+import requests
+import json
+from my_scopus import MY_API_KEY
+from my_scopus import ak, pdv, ufo, ufo_ips
+
+
+#
+# Get the last N publications of an given author list
+#
+# Arguments: list of scopus author ids (e.g. "35313939900")
+# Returns: list of scopus article ids ('SCOPUS_ID:0037368024')
+#   The result can be used in the display functions like get_scopus_brief()
+#
+def get_scopus_list(author_list, opt='', n=5):
+    
+    if isinstance(author_list, list):
+        #print "Length of author list %d" % len(author_list)
+        query = ' OR '.join(['AU-ID('+au+')' for au in author_list])
+    else:
+        query = 'AU-ID('+author_list+')'
+
+    if len(opt) > 0:
+        query = query + " AND " + opt
+    
+    #print "Query: " + query
+    url = ("http://api.elsevier.com/content/search/scopus?query="
+           +query+ "&field=dc:identifier&count=" + str(n))
+    #print "URL: " + url
+    resp = requests.get(url,
+                headers={'Accept':'application/json',
+                        'X-ELS-APIKey': MY_API_KEY})
+    #print resp
+    results = resp.json()
+    return [[str(r['dc:identifier'])] for r in results['search-results']["entry"]]
+
+
+
+
+def get_scopus_info(SCOPUS_ID):
+    url = ("http://api.elsevier.com/content/abstract/scopus_id/"
+           + SCOPUS_ID
+           + "?field=article-number,title,publicationName,volume,issueIdentifier,"
+           + "prism:pageRange,coverDate,article-number,doi,citedby-count,prism:aggregationType,url,identifier,description,authors,prism:issn")
+    #print url
+    resp = requests.get(url,
+                headers={'Accept':'application/json',
+                        'X-ELS-APIKey': MY_API_KEY})
+    results = json.loads(resp.text.encode('utf-8'))
+    #print resp
+    print results
+    
+    fstring = '{authors}, {title}, {journal}, {volume}, {articlenum}, ({date}). {doi} (cited {cites} times).\n{abstract}\n\n'
+
+    return fstring.format(authors=', '.join([au['ce:indexed-name'] for au in results['abstracts-retrieval-response']['authors']['author']]),
+                                title=results['abstracts-retrieval-response']['coredata']['dc:title'].encode('utf-8'),
+                                 journal=results['abstracts-retrieval-response']['coredata']['prism:publicationName'].encode('utf-8'),
+                                 volume=results['abstracts-retrieval-response']['coredata']['prism:volume'].encode('utf-8'),
+                                 articlenum=(results['abstracts-retrieval-response']['coredata'].get('prism:pageRange') or
+                                             results['abstracts-retrieval-response']['coredata'].get('article-number')).encode('utf-8'),
+                                 date=results['abstracts-retrieval-response']['coredata']['prism:coverDate'].encode('utf-8'),
+                                 doi='doi:' + results['abstracts-retrieval-response']['coredata']['prism:doi'].encode('utf-8'),
+                                 cites=int(results['abstracts-retrieval-response']['coredata']['citedby-count'].encode('utf-8')),
+                                 abstract=results['abstracts-retrieval-response']['coredata']['dc:description'].encode('utf-8'))
+
+
+#
+# Display a list of publications in plain text format
+#
+# Argument: scopus id of the publication
+#
+# Todo: Implement other formats (e.g. html, bibtex)
+#   Format publications as articles, Title, Abstract
+#
+def get_scopus_brief(SCOPUS_ID, max_authors=1000):
+    id = SCOPUS_ID
+    if isinstance(id, list):
+        id = id[0]
+        
+    url = ("http://api.elsevier.com/content/abstract/scopus_id/"
+           + id
+           + "?field=authors,article-number,title,publicationName,volume,issueIdentifier,"
+           + "prism:pageRange,coverDate,article-number,doi,citedby-count,prism:aggregationType,url,identifier,description,prism:issn")
+
+    #print url
+    resp = requests.get(url,
+                headers={'Accept':'application/json',
+                            'X-ELS-APIKey': MY_API_KEY})
+    results = json.loads(resp.text.encode('utf-8'))
+    #print resp
+    #print results
+    
+    coredata = results['abstracts-retrieval-response']['coredata']
+    
+    pub = ''
+    authors = results['abstracts-retrieval-response']['authors']['author']
+    #print "Number of authors: %d" %len(authors)
+
+    if len(authors) > max_authors:
+        return ''
+
+    if len(authors) > 20:
+        pub = pub + authors[0]['ce:indexed-name'] + ' et.al.: '
+    else:
+        pub = ', '.join([au['ce:indexed-name'] for au in authors]) + ': '
+
+    try:
+        if coredata.get('dc:title'):
+            pub = pub + coredata.get('dc:title').encode('utf-8')
+    except ValueError:
+        print "!!! Error encoding title of publication !!!"
+        #print coredata.get('dc:title')
+        pub = pub + coredata.get('dc:title')
+
+    if coredata.get('prism:publicationName'):
+        pub = pub + ', ' + coredata.get('prism:publicationName').encode('utf-8')
+
+    if coredata.get('prism:volume'):
+        pub = pub + ', ' + coredata.get('prism:volume').encode('utf-8')
+
+    if coredata.get('prism:issueIdentifier'):
+        pub = pub + ', ' + coredata.get('prism:issueIdentifier').encode('utf-8')
+    
+    if coredata.get('prism:coverDate'):
+        pub = pub + ' (' + coredata.get('prism:coverDate').encode('utf-8') + ') '
+
+    if coredata.get('prism:pageRange'):
+        pub = pub + coredata.get('prism:pageRange').encode('utf-8')
+    elif coredata.get('article-number'):
+        pub = pub + coredata.get('article-number').encode('utf-8')
+
+    if coredata.get('prism:doi'):
+        pub = pub + ', doi:' + coredata.get('prism:doi').encode('utf-8')
+
+    if coredata.get('citedby-count'):
+        pub = pub + ' (cited ' + coredata.get('citedby-count').encode('utf-8') + ' times)'
+
+    pub = pub + '.\n'
+
+
+    return pub
+
+
+
+# What kind of lists are interesting for a group website?
+
+
+# Organisation:
+#
+# Latest publications               |  Contact person
+#   List of 3-5 with titles first
+#   First lines of the abtract      | Events:
+#   Number of citations             | Upcoming event
+#   (Blog format)
+#
+# Lately cited papers               | Featured publications
+#   List of 3-5 papers updated cites|
+#   in the last N month
+#   Format: Blog
+#
+#
+
+# List of the latest N publications (N = 3-5)
+# List of the latest publications without big collaboration papers?
+# List of collaboration papers
+# List of publication of the last N month (N = 6-12)
+# List split by sub groups
+
+# List of all publication of one year, that can be selected
+# List of featured publications (selected by a list of Scopus-Ids / or Blog with Scopus-Id)
+
+
+# Problem: If the people are in too many different projects
+# How to list only the ones of a certain topic?
+# Is there a subgroup, that is only involved in one project?
+# Try: Leave out ak, csa, we, baumbach, etc?
+
+
+#print "LUMINEU 0\u03bd2\u03b2 project".encode('utf-8')
+
+
+# get number of publications?
+# order by date?
+# limit to certain interval
+
+
+#
+# Save publications with their number of citations in JSON file of SQLITE database?!
+# List of newly cited publications
+#
+
+
+
+resp = requests.get("http://api.elsevier.com/content/author?author_id="+ak+"&view=metrics",
+            headers={'Accept':'application/json',
+                             'X-ELS-APIKey': MY_API_KEY})
+
+#print resp
+
+#print json.dumps(resp.json(),
+#                 sort_keys=True,
+#                 indent=4, separators=(',', ': '))
+
+
+#print get_scopus_info('SCOPUS_ID:0037368024')
+
+#publist = get_scopus_list(ak)
+#publist = get_scopus_list([ak,ak2], 'PUBYEAR = 2014', 30)
+
+#publist = get_scopus_list(pdv, 'PUBYEAR = 2015', 30)
+
+#publist = get_scopus_list(ufo_ipe, 'PUBYEAR = 2015', 30)
+#publist = get_scopus_list(ufo_ips, 'PUBYEAR = 2015', 30)
+
+# Exclude authors?
+publist = get_scopus_list(pdv, 'NOT AU-ID(7006284555)', 10)
+
+
+# Author ausschliessen - black list !!!
+
+
+#
+# Display the result
+#
+print "Number of publications: %d" % len(publist)
+
+for pub in publist:
+    print get_scopus_brief(pub,10000)
+
+
+# Test printing functions
+
+#print publist[2][0]
+#print get_scopus_info('SCOPUS_ID:0037368024')
+#print get_scopus_info(publist[2][0])
+
+
+
+
+# Merge publications
+# Search for all publications? Eliminate duplicates
+# Format abstract
+#
+# There seem to be also preformatted output?
+# Is output in BibTeX possible?
+#
+# Impact-Factor of the journal?
+
+
+
+# Queries:
+
+# List all publications
+# List the latest publications
+# List publications of the group without duplicate entries
+# List latest pubs
+# List publications with highest citation count
+# List publications in magazines with highest impact value
+# Print collaborator network of institutions
+# Print list of collaborations without large collaborations
+
+#
+

+ 16 - 0
test-scopus2.py

@@ -0,0 +1,16 @@
+import requests
+import json
+from my_scopus import MY_API_KEY
+
+resp = requests.get("http://api.elsevier.com/content/search/scopus?query=AU-ID(7004212771)&field=dc:identifier&count=10",
+                    headers={'Accept':'application/json',
+                    'X-ELS-APIKey': MY_API_KEY})
+
+results = resp.json()
+
+print results
+
+#for r in results['search-results']["entry"]
+#    print r
+
+#return [[str(r['dc:identifier'])] for r in results['search-results']["entry"]]

+ 57 - 0
test-wp.py

@@ -0,0 +1,57 @@
+# Test access to Wordpress
+#
+
+# Comment: Quite a usefull API to attach external scripts!!!
+#
+
+from wordpress_xmlrpc import Client
+from wordpress_xmlrpc import WordPressPost, WordPressComment
+from wordpress_xmlrpc.methods.posts import GetPosts, NewPost, EditPost
+from wordpress_xmlrpc.methods.comments import NewComment, EditComment
+from wordpress_xmlrpc.methods.users import GetUserInfo
+
+# Use Wordpress account - not the mysql credentials
+# Todo: use scopus later !!!
+wp = Client('http://localhost/~kopmann/ufo2/xmlrpc.php', 'ufo', '$ipepdv$')
+#print wp.call(GetPosts())
+
+#print wp.call(GetUserInfo())
+
+# Todo: Set the date of the post according to the scopus date
+
+post = WordPressPost()
+post.title = 'My post 7' # put title of the publication here
+post.slug = 'DOIxxxxx7' # set the name of the post different to the title
+post.content = 'This is a more complete example post about XML-RPC (but still not comlete enough)'
+post.id = wp.call(NewPost(post)) # Creates a new post and returns the id!
+
+post.terms_names = {
+    # 'post_tag': ['test', 'firstpost'], # what's that? I don't use it currently
+    'category': ['Publications', 'Reports'] # defined in WP + python script
+}
+
+# whoops, I forgot to publish it!
+post.post_status = 'publish' # alternative is draft here !
+post.comment_status = 'open' # allow comments - may be only for scopus
+wp.call(EditPost(post.id, post))# Update the before created post
+
+# Todo:
+# Save the id in the publication table, together with the with the scopus id
+# and bibtext file
+# Create a table with the defined authors and their publications?! (later)
+
+
+# Todo
+# - try to add a comment to a post !!!
+# - how to identify that a post has been deleted, because the publication
+# does not fit to the scope of the site? E.g. the Auger publications?
+# or my biotech items???
+#
+
+comment = WordPressComment()
+comment.content = 'Hi, thats cool - we can also add our comments automatically'
+
+comment.id = wp.call(NewComment(post.id, comment))
+
+
+

+ 38 - 0
test-wp2.py

@@ -0,0 +1,38 @@
+# Test access to Wordpress
+#
+
+# Comment: Quite a usefull API to attach external scripts!!!
+#
+
+import sys
+
+from wordpress_xmlrpc import Client
+from wordpress_xmlrpc import WordPressPost, WordPressComment
+from wordpress_xmlrpc.methods.posts import GetPost
+
+
+# Access WP post
+# How to detect whether it is available???
+
+
+# Use Wordpress account - not the mysql credentials
+# Todo: use scopus later !!!
+wp = Client('http://localhost/~kopmann/ufo2/xmlrpc.php', 'ufo', '$ipepdv$')
+
+
+wpid = 1149
+if len(sys.argv) > 1:
+    wpid = int(sys.argv[1])
+
+
+# Read post
+try:
+    post = wp.call(GetPost(wpid))
+    print "Post %d: %s" %(wpid,post.title)
+
+except:
+    print "Post %d seems to be not available" % wpid
+
+
+
+