A search interface for the Performing Patents Otherwise publication as part of the Politics of Patents case study (part of Copim WP6): this parses data from the archive of RTF files and provides additional data from the European Patent Office OPS API. https://patents.copim.ac.uk
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

solr.py 8.1KB

2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221
  1. # @name: solr.py
  2. # @creation_date: 2022-09-07
  3. # @license: The MIT License <https://opensource.org/licenses/MIT>
  4. # @author: Simon Bowie <simon.bowie.19@gmail.com>
  5. # @purpose: Performs Solr functions
  6. # @acknowledgements:
  7. import os
  8. import requests
  9. import re
  10. import urllib
  11. import random
  12. import pycountry
  13. from . import ops
  14. # get config variables from OS environment variables: set in env file passed through Docker Compose
  15. solr_hostname = os.environ.get('SOLR_HOSTNAME')
  16. solr_port = os.environ.get('SOLR_PORT')
  17. def content_search(core, sort, search=None, id=None):
  18. # Assemble a query string to send to Solr. This uses the Solr hostname from config.env. Solr's query syntax can be found at many sites including https://lucene.apache.org/solr/guide/6_6/the-standard-query-parser.html
  19. if id is not None:
  20. solrurl = 'http://' + solr_hostname + ':' + solr_port + '/solr/' + core + '/select?q.op=OR&q=id%3A"' + id + '"&wt=json'
  21. else:
  22. if (sort == 'relevance'):
  23. solrurl = 'http://' + solr_hostname + ':' + solr_port + '/solr/' + core + '/select?q.op=OR&q=content%3A' + urllib.parse.quote_plus(search) + '&wt=json'
  24. else:
  25. solrurl = 'http://' + solr_hostname + ':' + solr_port + '/solr/' + core + '/select?q.op=OR&q=content%3A' + urllib.parse.quote_plus(search) + '&wt=json&sort=' + sort
  26. # get result
  27. request = requests.get(solrurl)
  28. # turn the API response into useful Json
  29. json = request.json()
  30. num_found = json['response']['numFound']
  31. if (num_found == 0):
  32. output = 'no results found'
  33. else:
  34. output = []
  35. for result in json['response']['docs']:
  36. # set ID variable
  37. id = result['id']
  38. # set content variable
  39. content = result['content']
  40. # parse result
  41. result_output = parse_result(id, content)
  42. output.append(result_output)
  43. return output, num_found
  44. def term_search(core, sort, field, input):
  45. # Assemble a query string to send to Solr. This uses the Solr hostname from config.env. Solr's query syntax can be found at many sites including https://lucene.apache.org/solr/guide/6_6/the-standard-query-parser.html
  46. if (sort == 'relevance'):
  47. solrurl = 'http://' + solr_hostname + ':' + solr_port + '/solr/' + core + '/select?q.op=OR&q=%7B!term%20f%3D' + field + '%7D' + input + '&wt=json'
  48. else:
  49. solrurl = 'http://' + solr_hostname + ':' + solr_port + '/solr/' + core + '/select?q.op=OR&q=%7B!term%20f%3D' + field + '%7D' + input + '&wt=json&sort=' + sort
  50. # get result
  51. request = requests.get(solrurl)
  52. # turn the API response into useful Json
  53. json = request.json()
  54. num_found = json['response']['numFound']
  55. if (num_found == 0):
  56. output = 'no results found'
  57. else:
  58. output = []
  59. for result in json['response']['docs']:
  60. # set ID variable
  61. id = result['id']
  62. # set content variable
  63. content = result['content']
  64. # parse result
  65. result_output = parse_result(id, content)
  66. output.append(result_output)
  67. return output, num_found
  68. def parse_result(id, input):
  69. output = {}
  70. output['id'] = id
  71. # set document reference number (used for OPS API)
  72. doc_ref = re.search('=D\s(([^\s]*)\s([^\s]*)\s([^\s]*))', input)
  73. if doc_ref is None:
  74. doc_ref = re.search('=D&locale=en_EP\s(([^\s]*)\s([^\s]*)\s([^\s]*))', input)
  75. if doc_ref is None:
  76. output['doc_ref'] = ""
  77. else:
  78. output['doc_ref'] = doc_ref.group(1).replace(" ","")
  79. else:
  80. output['doc_ref'] = doc_ref.group(1).replace(" ","")
  81. # search for the application ID in the content element and display it
  82. application_id = re.search('Application.*\n(.*)\n', input)
  83. output['application_id'] = application_id.group(1)
  84. # search for the EPO publication URL in the content element and display it
  85. epo_publication = re.search('Publication.*\n(.*)\n', input)
  86. output['epo_publication_url'] = epo_publication.group(1)
  87. # search for the IPC publication URL in the content element and display it
  88. ipc_publication = re.search('IPC.*\n(.*)\n', input)
  89. output['ipc_publication_url'] = ipc_publication.group(1)
  90. # search for the title in the content element and display it
  91. title = re.search('Title.*\n(.*)\n', input)
  92. if title is not None:
  93. output['title'] = title.group(1)
  94. # search for the abstract in the content element and display it
  95. abstract = re.search('Abstract.*\n(.*)\n', input)
  96. if abstract is None:
  97. abstract = re.search('\(.\) \\n\\n(.*)\\n', input)
  98. if abstract is not None:
  99. output['abstract'] = abstract.group(1);
  100. # search for the year in the content element and display it
  101. year = re.search('=D[^\s]*\s[^\s]*\s[^\s]*\s[^\s]*\s(\d{4})', input)
  102. if year is not None:
  103. output['year'] = year.group(1)
  104. # search for the country in the content element and display it
  105. country_code = re.search('FT=D[^\s]*\s(\w{2})', input)
  106. country = pycountry.countries.get(alpha_2=country_code.group(1))
  107. if country is not None:
  108. output['country'] = country
  109. else:
  110. country = pycountry.historic_countries.get(alpha_2=country_code.group(1))
  111. if country is not None:
  112. output['country'] = country
  113. else:
  114. output['country'] = country_code.group(1)
  115. return output
  116. def get_random_record(core):
  117. rand = str(random.randint(0, 9999999))
  118. # Assemble a query string to send to Solr. This uses the Solr hostname from config.env. Solr's query syntax can be found at many sites including https://lucene.apache.org/solr/guide/6_6/the-standard-query-parser.html
  119. solrurl = 'http://' + solr_hostname + ':' + solr_port + '/solr/' + core + '/select?q.op=OR&q=*%3A*&wt=json&sort=random_' + rand + '%20asc&rows=1'
  120. # get result
  121. request = requests.get(solrurl)
  122. # turn the API response into useful Json
  123. json = request.json()
  124. if (json['response']['numFound'] == 0):
  125. output = 'no results found'
  126. else:
  127. output = []
  128. for result in json['response']['docs']:
  129. # set ID variables
  130. id = result['id']
  131. # set content variable
  132. content = result['content']
  133. # parse result
  134. result_output = parse_result(id, content)
  135. output.append(result_output)
  136. return output
  137. def get_ten_random_elements(field):
  138. core = 'all'
  139. output = []
  140. i = 0
  141. while i <= 9:
  142. results = get_random_record(core)
  143. for result in results:
  144. if field in result:
  145. dict = {'id': result['id'], field: result[field]}
  146. output.append(dict)
  147. i += 1
  148. return output
  149. def get_ten_random_images():
  150. core = 'all'
  151. output = []
  152. i = 0
  153. while i <= 9:
  154. results = get_random_record(core)
  155. for result in results:
  156. if ops.get_images(result['doc_ref']):
  157. image = ops.get_images(result['doc_ref'])
  158. result.update(image)
  159. output.append(result)
  160. i += 1
  161. return output
  162. def get_total_number(core):
  163. # Assemble a query string to send to Solr. This uses the Solr hostname from config.env. Solr's query syntax can be found at many sites including https://lucene.apache.org/solr/guide/6_6/the-standard-query-parser.html
  164. solrurl = 'http://' + solr_hostname + ':' + solr_port + '/solr/' + core + '/select?q.op=OR&q=*:*&wt=json'
  165. # get result
  166. request = requests.get(solrurl)
  167. # turn the API response into useful Json
  168. json = request.json()
  169. num_found = json['response']['numFound']
  170. return num_found
  171. def get_term_data(field, core):
  172. # Assemble a query string to send to Solr. This uses the Solr hostname from config.env. Solr's query syntax can be found at many sites including https://lucene.apache.org/solr/guide/6_6/the-standard-query-parser.html
  173. solrurl = 'http://' + solr_hostname + ':' + solr_port + '/solr/' + core + '/terms?terms.fl=' + field + '&wt=json&terms.limit=1000&terms.sort=index'
  174. # get result
  175. request = requests.get(solrurl)
  176. # turn the API response into useful Json
  177. json = request.json()
  178. output = json['terms'][field]
  179. return output