A search interface for the Performing Patents Otherwise publication as part of the Politics of Patents case study (part of Copim WP6): this parses data from the archive of RTF files and provides additional data from the European Patent Office OPS API. https://patents.copim.ac.uk
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

145 lines
5.2KB

  1. # @name: solr.py
  2. # @creation_date: 2022-09-07
  3. # @license: The MIT License <https://opensource.org/licenses/MIT>
  4. # @author: Simon Bowie <simon.bowie.19@gmail.com>
  5. # @purpose: Performs Solr functions
  6. # @acknowledgements:
  7. import os
  8. import requests
  9. import re
  10. import urllib
  11. import random
  12. from . import ops
  13. # get config variables from OS environment variables: set in env file passed through Docker Compose
  14. solr_hostname = os.environ.get('SOLR_HOSTNAME')
  15. solr_port = os.environ.get('SOLR_PORT')
  16. def solr_search(core, sort, search=None, id=None):
  17. # Assemble a query string to send to Solr. This uses the Solr hostname from config.env. Solr's query syntax can be found at many sites including https://lucene.apache.org/solr/guide/6_6/the-standard-query-parser.html
  18. if id is not None:
  19. solrurl = 'http://' + solr_hostname + ':' + solr_port + '/solr/' + core + '/select?q.op=OR&q=id%3A"' + id + '"&wt=json'
  20. else:
  21. if (sort == 'relevance'):
  22. solrurl = 'http://' + solr_hostname + ':' + solr_port + '/solr/' + core + '/select?q.op=OR&q=content%3A' + urllib.parse.quote_plus(search) + '&wt=json'
  23. else:
  24. solrurl = 'http://' + solr_hostname + ':' + solr_port + '/solr/' + core + '/select?q.op=OR&q=content%3A' + urllib.parse.quote_plus(search) + '&wt=json&sort=' + sort
  25. # get result
  26. request = requests.get(solrurl)
  27. # turn the API response into useful Json
  28. json = request.json()
  29. if (json['response']['numFound'] == 0):
  30. output = 'no results found'
  31. else:
  32. output = []
  33. for result in json['response']['docs']:
  34. # set ID variable
  35. id = result['id']
  36. # set content variable
  37. content = result['content']
  38. # parse result
  39. result_output = parse_result(id, content)
  40. output.append(result_output)
  41. return output
  42. def parse_result(id, input):
  43. output = {}
  44. output['id'] = id
  45. # set document reference number (used for OPS API)
  46. doc_ref = re.search('=D\s(([^\s]*)\s([^\s]*)\s([^\s]*))', input)
  47. if doc_ref is None:
  48. doc_ref = re.search('=D&locale=en_EP\s(([^\s]*)\s([^\s]*)\s([^\s]*))', input)
  49. output['doc_ref'] = doc_ref.group(1).replace(" ","")
  50. else:
  51. output['doc_ref'] = doc_ref.group(1).replace(" ","")
  52. # search for the application ID in the content element and display it
  53. application_id = re.search('Application.*\n(.*)\n', input)
  54. output['application_id'] = application_id.group(1)
  55. # search for the EPO publication URL in the content element and display it
  56. epo_publication = re.search('Publication.*\n(.*)\n', input)
  57. output['epo_publication_url'] = epo_publication.group(1)
  58. # search for the IPC publication URL in the content element and display it
  59. ipc_publication = re.search('IPC.*\n(.*)\n', input)
  60. output['ipc_publication_url'] = ipc_publication.group(1)
  61. # search for the title in the content element and display it
  62. title = re.search('Title.*\n(.*)\n', input)
  63. if title is not None:
  64. output['title'] = title.group(1)
  65. # search for the abstract in the content element and display it
  66. abstract = re.search('Abstract.*\n(.*)\n', input)
  67. if abstract is None:
  68. abstract = re.search('\(.\) \\n\\n(.*)\\n', input)
  69. if abstract is not None:
  70. output['abstract'] = abstract.group(1);
  71. # search for the year in the content element and display it
  72. year = re.search('=D[^\s]*\s[^\s]*\s[^\s]*\s[^\s]*\s(\d{4})', input)
  73. if year is not None:
  74. output['year'] = year.group(1)
  75. return output
  76. def get_random_record(core):
  77. rand = str(random.randint(0, 9999999))
  78. # Assemble a query string to send to Solr. This uses the Solr hostname from config.env. Solr's query syntax can be found at many sites including https://lucene.apache.org/solr/guide/6_6/the-standard-query-parser.html
  79. solrurl = 'http://' + solr_hostname + ':' + solr_port + '/solr/' + core + '/select?q.op=OR&q=*%3A*&wt=json&sort=random_' + rand + '%20asc&rows=1'
  80. # get result
  81. request = requests.get(solrurl)
  82. # turn the API response into useful Json
  83. json = request.json()
  84. if (json['response']['numFound'] == 0):
  85. output = 'no results found'
  86. else:
  87. output = []
  88. for result in json['response']['docs']:
  89. # set ID variables
  90. id = result['id']
  91. # set content variable
  92. content = result['content']
  93. # parse result
  94. result_output = parse_result(id, content)
  95. output.append(result_output)
  96. return output
  97. def get_ten_random_elements(field):
  98. core = 'all'
  99. output = []
  100. i = 0
  101. while i <= 9:
  102. results = get_random_record(core)
  103. for result in results:
  104. if field in result:
  105. dict = {'id': result['id'], field: result[field]}
  106. output.append(dict)
  107. i += 1
  108. return output
  109. def get_ten_random_images():
  110. core = 'all'
  111. output = []
  112. i = 0
  113. while i <= 9:
  114. results = get_random_record(core)
  115. for result in results:
  116. if ops.get_images(result['doc_ref']):
  117. image = ops.get_images(result['doc_ref'])
  118. result.update(image)
  119. output.append(result)
  120. i += 1
  121. return output