A search interface for the Performing Patents Otherwise publication as part of the Politics of Patents case study (part of Copim WP6): this parses data from the archive of RTF files and provides additional data from the European Patent Office OPS API. https://patents.copim.ac.uk
Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.

solr.py 8.7KB

il y a 2 ans
il y a 2 ans
il y a 2 ans
il y a 2 ans
il y a 2 ans
il y a 2 ans
il y a 2 ans
il y a 2 ans
il y a 2 ans
il y a 2 ans
il y a 2 ans
il y a 2 ans
il y a 2 ans
il y a 2 ans
il y a 2 ans
il y a 2 ans
il y a 2 ans
il y a 2 ans
il y a 2 ans
il y a 2 ans
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248
  1. # @name: solr.py
  2. # @creation_date: 2022-09-07
  3. # @license: The MIT License <https://opensource.org/licenses/MIT>
  4. # @author: Simon Bowie <simon.bowie.19@gmail.com>
  5. # @purpose: Performs Solr functions
  6. # @acknowledgements:
  7. # pycountry module for country data: https://pypi.org/project/pycountry/
  8. import os
  9. import requests
  10. import re
  11. import urllib
  12. import random
  13. import pycountry
  14. from . import ops
# get config variables from OS environment variables: set in env file passed through Docker Compose
# NOTE: os.environ.get returns None when a variable is unset, so either value may be None;
# the functions below concatenate both into request URLs as strings.
solr_hostname = os.environ.get('SOLR_HOSTNAME')
solr_port = os.environ.get('SOLR_PORT')
  18. def solr_search(solrurl):
  19. # get result
  20. request = requests.get(solrurl)
  21. # turn the API response into useful Json
  22. json = request.json()
  23. num_found = json['response']['numFound']
  24. facets = []
  25. if (num_found == 0):
  26. output = 'no results found'
  27. else:
  28. output = []
  29. for result in json['response']['docs']:
  30. # set ID variable
  31. id = result['id']
  32. # set content variable
  33. content = result['content']
  34. # parse result
  35. result_output = parse_result(id, content)
  36. output.append(result_output)
  37. try:
  38. json['facet_counts']
  39. facets = json['facet_counts']['facet_fields']
  40. except KeyError:
  41. pass
  42. return output, num_found, facets
  43. def query_search(core, sort, query, country, year, page):
  44. # assemble parameters for the query string to Solr
  45. if (sort == 'relevance'):
  46. sort_parameter = ''
  47. else:
  48. sort_parameter = '&sort=' + sort
  49. if (query is None or query == 'None'):
  50. query_parameter = '&q=*%3A*'
  51. else:
  52. query_parameter = '&q=content%3A' + urllib.parse.quote_plus(query)
  53. if (country is None or country == 'None'):
  54. country_parameter = ''
  55. else:
  56. field = 'country'
  57. country_parameter = '&fq=%7B!term%20f%3D' + field + '%7D' + country
  58. if (year is None or year == 'None'):
  59. year_parameter = ''
  60. else:
  61. field = 'year'
  62. year_parameter = '&fq=%7B!term%20f%3D' + field + '%7D' + year
  63. if (page is None or page == 'None'):
  64. page_parameter = ''
  65. else:
  66. start = (int(page) * 10) - 10
  67. start = str(start)
  68. page_parameter = '&start=' + start + '&rows=10'
  69. # assemble a query string to send to Solr. This uses the Solr hostname from config.env. Solr's query syntax can be found at many sites including https://lucene.apache.org/solr/guide/6_6/the-standard-query-parser.html
  70. solrurl = 'http://' + solr_hostname + ':' + solr_port + '/solr/' + core + '/select?q.op=OR&indent=true' + query_parameter + '&wt=json' + sort_parameter + country_parameter + year_parameter + page_parameter + '&facet.field=country&facet.field=year&facet.sort=count&facet.mincount=1&facet=true'
  71. output = solr_search(solrurl)
  72. return output
  73. def id_search(core, id):
  74. # assemble a query string to send to Solr. This uses the Solr hostname from config.env. Solr's query syntax can be found at many sites including https://lucene.apache.org/solr/guide/6_6/the-standard-query-parser.html
  75. solrurl = 'http://' + solr_hostname + ':' + solr_port + '/solr/' + core + '/select?q.op=OR&q=id%3A"' + id + '"&wt=json'
  76. output = solr_search(solrurl)
  77. return output
  78. def random_search(core):
  79. rand = str(random.randint(0, 9999999))
  80. # assemble a query string to send to Solr. This uses the Solr hostname from config.env. Solr's query syntax can be found at many sites including https://lucene.apache.org/solr/guide/6_6/the-standard-query-parser.html
  81. solrurl = 'http://' + solr_hostname + ':' + solr_port + '/solr/' + core + '/select?q.op=OR&q=*%3A*&wt=json&sort=random_' + rand + '%20asc&rows=1'
  82. output = solr_search(solrurl)
  83. return output
  84. def parse_result(id, input):
  85. output = {}
  86. output['id'] = id
  87. # set document reference number (used for OPS API)
  88. doc_ref = re.search('=D\s(([^\s]*)\s([^\s]*)\s([^\s]*))', input)
  89. if doc_ref is None:
  90. doc_ref = re.search('=D&locale=en_EP\s(([^\s]*)\s([^\s]*)\s([^\s]*))', input)
  91. if doc_ref is None:
  92. output['doc_ref'] = ""
  93. else:
  94. output['doc_ref'] = doc_ref.group(1).replace(" ","")
  95. else:
  96. output['doc_ref'] = doc_ref.group(1).replace(" ","")
  97. # search for the application ID in the content element and display it
  98. application_id = re.search('Application.*\n(.*)\n', input)
  99. output['application_id'] = application_id.group(1)
  100. # search for the EPO publication URL in the content element and display it
  101. epo_publication = re.search('Publication.*\n(.*)\n', input)
  102. output['epo_publication_url'] = epo_publication.group(1)
  103. # search for the IPC publication URL in the content element and display it
  104. ipc_publication = re.search('IPC.*\n(.*)\n', input)
  105. if ipc_publication is not None:
  106. if ipc_publication.group(1) is not None:
  107. output['ipc_publication_url'] = ipc_publication.group(1)
  108. # search for the title in the content element and display it
  109. title = re.search('Title.*?\\n(.*?)\\n|Tile.?\\n(.*?)\\n', input)
  110. if title is not None:
  111. if title.group(1) is not None:
  112. output['title'] = title.group(1)
  113. else:
  114. output['title'] = title.group(2)
  115. # search for the abstract in the content element and display it
  116. abstract = re.search('Abstract.*\n(.*)\n', input)
  117. if abstract is not None:
  118. if abstract.group(1) is not None:
  119. output['abstract'] = abstract.group(1)
  120. else:
  121. abstract = re.search('\(.*?\) (\\n\\n\\n\\n|\\n\\n\\n|\\n\\n)(.*)\\n', input)
  122. if abstract is not None:
  123. if abstract.group(2) is not None:
  124. output['abstract'] = abstract.group(2)
  125. # search for the year in the content element and display it
  126. year = re.search('=D[^\s]*\s[^\s]*\s[^\s]*\s[^\s]*\s(\d{4})', input)
  127. if year is not None:
  128. output['year'] = year.group(1)
  129. # search for the country in the content element and display it
  130. country_code = re.search('FT=D[^\s]*\s(\w{2})', input)
  131. if country_code is not None:
  132. country = pycountry.countries.get(alpha_2=country_code.group(1))
  133. if country is not None:
  134. output['country'] = country
  135. else:
  136. country = pycountry.historic_countries.get(alpha_2=country_code.group(1))
  137. if country is not None:
  138. output['country'] = country
  139. else:
  140. output['country'] = country_code.group(1)
  141. output['raw'] = input
  142. return output
  143. def get_number_random_records(core, number):
  144. results_list = []
  145. i = 0
  146. while i <= number-1:
  147. search_results = random_search(core)
  148. results = search_results[0]
  149. for result in results:
  150. results_list.append(result)
  151. i += 1
  152. return results_list
  153. def get_ten_random_elements(field):
  154. core = 'all'
  155. output = []
  156. i = 0
  157. while i <= 9:
  158. search_results = random_search(core)
  159. results = search_results[0]
  160. for result in results:
  161. if field in result:
  162. dict = {'id': result['id'], field: result[field]}
  163. output.append(dict)
  164. i += 1
  165. return output
  166. def get_random_images(number):
  167. core = 'all'
  168. output = []
  169. i = 0
  170. while i <= number-1:
  171. search_results = random_search(core)
  172. results = search_results[0]
  173. for result in results:
  174. if ops.get_images(result['doc_ref']):
  175. image = ops.get_images(result['doc_ref'])
  176. dict = {'id': result['id']}
  177. dict.update(image)
  178. output.append(dict)
  179. i += 1
  180. return output
  181. def get_total_number(core):
  182. # Assemble a query string to send to Solr. This uses the Solr hostname from config.env. Solr's query syntax can be found at many sites including https://lucene.apache.org/solr/guide/6_6/the-standard-query-parser.html
  183. solrurl = 'http://' + solr_hostname + ':' + solr_port + '/solr/' + core + '/select?q.op=OR&q=*:*&wt=json'
  184. # get result
  185. request = requests.get(solrurl)
  186. # turn the API response into useful Json
  187. json = request.json()
  188. num_found = json['response']['numFound']
  189. return num_found
  190. def get_term_data(field, core):
  191. # Assemble a query string to send to Solr. This uses the Solr hostname from config.env. Solr's query syntax can be found at many sites including https://lucene.apache.org/solr/guide/6_6/the-standard-query-parser.html
  192. solrurl = 'http://' + solr_hostname + ':' + solr_port + '/solr/' + core + '/terms?terms.fl=' + field + '&wt=json&terms.limit=1000&terms.sort=index'
  193. # get result
  194. request = requests.get(solrurl)
  195. # turn the API response into useful Json
  196. json = request.json()
  197. output = json['terms'][field]
  198. return output