A search interface for the Performing Patents Otherwise publication as part of the Politics of Patents case study (part of Copim WP6): this parses data from the archive of RTF files and provides additional data from the European Patent Office OPS API. https://patents.copim.ac.uk
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

233 lines
8.8KB

  1. # @name: solr.py
  2. # @creation_date: 2022-09-07
  3. # @license: The MIT License <https://opensource.org/licenses/MIT>
  4. # @author: Simon Bowie <simon.bowie.19@gmail.com>
  5. # @purpose: Performs Solr functions
  6. # @acknowledgements:
  7. import os
  8. import requests
  9. import re
  10. import urllib
  11. import random
  12. import pycountry
  13. from . import ops
  14. # get config variables from OS environment variables: set in env file passed through Docker Compose
  15. solr_hostname = os.environ.get('SOLR_HOSTNAME')
  16. solr_port = os.environ.get('SOLR_PORT')
  17. def content_search(core, sort, search=None, id=None):
  18. # Assemble a query string to send to Solr. This uses the Solr hostname from config.env. Solr's query syntax can be found at many sites including https://lucene.apache.org/solr/guide/6_6/the-standard-query-parser.html
  19. if id is not None:
  20. solrurl = 'http://' + solr_hostname + ':' + solr_port + '/solr/' + core + '/select?q.op=OR&q=id%3A"' + id + '"&wt=json'
  21. else:
  22. if (sort == 'relevance'):
  23. solrurl = 'http://' + solr_hostname + ':' + solr_port + '/solr/' + core + '/select?q.op=OR&q=content%3A' + urllib.parse.quote_plus(search) + '&wt=json&facet.field=country&facet.field=year&facet.sort=count&facet=true'
  24. else:
  25. solrurl = 'http://' + solr_hostname + ':' + solr_port + '/solr/' + core + '/select?q.op=OR&q=content%3A' + urllib.parse.quote_plus(search) + '&wt=json&sort=' + sort + '&facet.field=country&facet.field=year&facet.sort=count&facet=true'
  26. # get result
  27. request = requests.get(solrurl)
  28. # turn the API response into useful Json
  29. json = request.json()
  30. num_found = json['response']['numFound']
  31. if (num_found == 0):
  32. output = 'no results found'
  33. else:
  34. output = []
  35. for result in json['response']['docs']:
  36. # set ID variable
  37. id = result['id']
  38. # set content variable
  39. content = result['content']
  40. # parse result
  41. result_output = parse_result(id, content)
  42. output.append(result_output)
  43. country_facet = json['facet_counts']['facet_fields']['country']
  44. year_facet = json['facet_counts']['facet_fields']['year']
  45. return output, num_found, country_facet, year_facet
  46. def term_search(core, sort, field, input):
  47. # Assemble a query string to send to Solr. This uses the Solr hostname from config.env. Solr's query syntax can be found at many sites including https://lucene.apache.org/solr/guide/6_6/the-standard-query-parser.html
  48. if (sort == 'relevance'):
  49. solrurl = 'http://' + solr_hostname + ':' + solr_port + '/solr/' + core + '/select?q.op=OR&q=%7B!term%20f%3D' + field + '%7D' + input + '&wt=json'
  50. else:
  51. solrurl = 'http://' + solr_hostname + ':' + solr_port + '/solr/' + core + '/select?q.op=OR&q=%7B!term%20f%3D' + field + '%7D' + input + '&wt=json&sort=' + sort
  52. # get result
  53. request = requests.get(solrurl)
  54. # turn the API response into useful Json
  55. json = request.json()
  56. num_found = json['response']['numFound']
  57. if (num_found == 0):
  58. output = 'no results found'
  59. else:
  60. output = []
  61. for result in json['response']['docs']:
  62. # set ID variable
  63. id = result['id']
  64. # set content variable
  65. content = result['content']
  66. # parse result
  67. result_output = parse_result(id, content)
  68. output.append(result_output)
  69. return output, num_found
  70. def parse_result(id, input):
  71. output = {}
  72. output['id'] = id
  73. # set document reference number (used for OPS API)
  74. doc_ref = re.search('=D\s(([^\s]*)\s([^\s]*)\s([^\s]*))', input)
  75. if doc_ref is None:
  76. doc_ref = re.search('=D&locale=en_EP\s(([^\s]*)\s([^\s]*)\s([^\s]*))', input)
  77. if doc_ref is None:
  78. output['doc_ref'] = ""
  79. else:
  80. output['doc_ref'] = doc_ref.group(1).replace(" ","")
  81. else:
  82. output['doc_ref'] = doc_ref.group(1).replace(" ","")
  83. # search for the application ID in the content element and display it
  84. application_id = re.search('Application.*\n(.*)\n', input)
  85. output['application_id'] = application_id.group(1)
  86. # search for the EPO publication URL in the content element and display it
  87. epo_publication = re.search('Publication.*\n(.*)\n', input)
  88. output['epo_publication_url'] = epo_publication.group(1)
  89. # search for the IPC publication URL in the content element and display it
  90. ipc_publication = re.search('IPC.*\n(.*)\n', input)
  91. output['ipc_publication_url'] = ipc_publication.group(1)
  92. # search for the title in the content element and display it
  93. title = re.search('Title.*?\\n(.*?)\\n|Tile.?\\n(.*?)\\n', input)
  94. if title is not None:
  95. if title.group(1) is not None:
  96. output['title'] = title.group(1)
  97. else:
  98. output['title'] = title.group(2)
  99. # search for the abstract in the content element and display it
  100. abstract = re.search('Abstract.*\n(.*)\n', input)
  101. if abstract is not None:
  102. if abstract.group(1) is not None:
  103. output['abstract'] = abstract.group(1)
  104. else:
  105. abstract = re.search('\(.*?\) (\\n\\n\\n\\n|\\n\\n\\n|\\n\\n)(.*)\\n', input)
  106. if abstract is not None:
  107. if abstract.group(2) is not None:
  108. output['abstract'] = abstract.group(2)
  109. # search for the year in the content element and display it
  110. year = re.search('=D[^\s]*\s[^\s]*\s[^\s]*\s[^\s]*\s(\d{4})', input)
  111. if year is not None:
  112. output['year'] = year.group(1)
  113. # search for the country in the content element and display it
  114. country_code = re.search('FT=D[^\s]*\s(\w{2})', input)
  115. country = pycountry.countries.get(alpha_2=country_code.group(1))
  116. if country is not None:
  117. output['country'] = country
  118. else:
  119. country = pycountry.historic_countries.get(alpha_2=country_code.group(1))
  120. if country is not None:
  121. output['country'] = country
  122. else:
  123. output['country'] = country_code.group(1)
  124. output['raw'] = input
  125. return output
  126. def get_random_record(core):
  127. rand = str(random.randint(0, 9999999))
  128. # Assemble a query string to send to Solr. This uses the Solr hostname from config.env. Solr's query syntax can be found at many sites including https://lucene.apache.org/solr/guide/6_6/the-standard-query-parser.html
  129. solrurl = 'http://' + solr_hostname + ':' + solr_port + '/solr/' + core + '/select?q.op=OR&q=*%3A*&wt=json&sort=random_' + rand + '%20asc&rows=1'
  130. # get result
  131. request = requests.get(solrurl)
  132. # turn the API response into useful Json
  133. json = request.json()
  134. if (json['response']['numFound'] == 0):
  135. output = 'no results found'
  136. else:
  137. output = []
  138. for result in json['response']['docs']:
  139. # set ID variables
  140. id = result['id']
  141. # set content variable
  142. content = result['content']
  143. # parse result
  144. result_output = parse_result(id, content)
  145. output.append(result_output)
  146. return output
  147. def get_ten_random_elements(field):
  148. core = 'all'
  149. output = []
  150. i = 0
  151. while i <= 9:
  152. results = get_random_record(core)
  153. for result in results:
  154. if field in result:
  155. dict = {'id': result['id'], field: result[field]}
  156. output.append(dict)
  157. i += 1
  158. return output
  159. def get_ten_random_images():
  160. core = 'all'
  161. output = []
  162. i = 0
  163. while i <= 9:
  164. results = get_random_record(core)
  165. for result in results:
  166. if ops.get_images(result['doc_ref']):
  167. image = ops.get_images(result['doc_ref'])
  168. result.update(image)
  169. output.append(result)
  170. i += 1
  171. return output
  172. def get_total_number(core):
  173. # Assemble a query string to send to Solr. This uses the Solr hostname from config.env. Solr's query syntax can be found at many sites including https://lucene.apache.org/solr/guide/6_6/the-standard-query-parser.html
  174. solrurl = 'http://' + solr_hostname + ':' + solr_port + '/solr/' + core + '/select?q.op=OR&q=*:*&wt=json'
  175. # get result
  176. request = requests.get(solrurl)
  177. # turn the API response into useful Json
  178. json = request.json()
  179. num_found = json['response']['numFound']
  180. return num_found
  181. def get_term_data(field, core):
  182. # Assemble a query string to send to Solr. This uses the Solr hostname from config.env. Solr's query syntax can be found at many sites including https://lucene.apache.org/solr/guide/6_6/the-standard-query-parser.html
  183. solrurl = 'http://' + solr_hostname + ':' + solr_port + '/solr/' + core + '/terms?terms.fl=' + field + '&wt=json&terms.limit=1000&terms.sort=index'
  184. # get result
  185. request = requests.get(solrurl)
  186. # turn the API response into useful Json
  187. json = request.json()
  188. output = json['terms'][field]
  189. return output