A search interface for the Performing Patents Otherwise publication as part of the Politics of Patents case study (part of Copim WP6): this parses data from the archive of RTF files and provides additional data from the European Patent Office OPS API. https://patents.copim.ac.uk
Ви не можете вибрати більше 25 тем Теми мають розпочинатися з літери або цифри, можуть містити дефіси (-) і не повинні перевищувати 35 символів.

2 роки тому
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145
  1. # @name: solr.py
  2. # @version: 0.1
  3. # @creation_date: 2022-09-07
  4. # @license: The MIT License <https://opensource.org/licenses/MIT>
  5. # @author: Simon Bowie <simon.bowie.19@gmail.com>
  6. # @purpose: Performs Solr functions
  7. # @acknowledgements:
  8. import os
  9. import requests
  10. import re
  11. import urllib
  12. import random
  13. from . import ops
  14. # get config variables from OS environment variables: set in env file passed through Docker Compose
  15. solr_hostname = os.environ.get('SOLR_HOSTNAME')
  16. solr_port = os.environ.get('SOLR_PORT')
  17. def solr_search(core, sort, search=None, id=None):
  18. # Assemble a query string to send to Solr. This uses the Solr hostname from config.env. Solr's query syntax can be found at many sites including https://lucene.apache.org/solr/guide/6_6/the-standard-query-parser.html
  19. if id is not None:
  20. solrurl = 'http://' + solr_hostname + ':' + solr_port + '/solr/' + core + '/select?q.op=OR&q=id%3A"' + id + '"&wt=json'
  21. else:
  22. if (sort == 'relevance'):
  23. solrurl = 'http://' + solr_hostname + ':' + solr_port + '/solr/' + core + '/select?q.op=OR&q=content%3A' + urllib.parse.quote_plus(search) + '&wt=json'
  24. else:
  25. solrurl = 'http://' + solr_hostname + ':' + solr_port + '/solr/' + core + '/select?q.op=OR&q=content%3A' + urllib.parse.quote_plus(search) + '&wt=json&sort=' + sort
  26. # get result
  27. request = requests.get(solrurl)
  28. # turn the API response into useful Json
  29. json = request.json()
  30. if (json['response']['numFound'] == 0):
  31. output = 'no results found'
  32. else:
  33. output = []
  34. for result in json['response']['docs']:
  35. # set ID variable
  36. id = result['id']
  37. # set content variable
  38. content = result['content']
  39. # parse result
  40. result_output = parse_result(id, content)
  41. output.append(result_output)
  42. return output
  43. def parse_result(id, input):
  44. output = {}
  45. output['id'] = id
  46. # set document reference number (used for OPS API)
  47. doc_ref = re.search('=D\s(([^\s]*)\s([^\s]*)\s([^\s]*))', input)
  48. if doc_ref is None:
  49. doc_ref = re.search('=D&locale=en_EP\s(([^\s]*)\s([^\s]*)\s([^\s]*))', input)
  50. output['doc_ref'] = doc_ref.group(1).replace(" ","")
  51. else:
  52. output['doc_ref'] = doc_ref.group(1).replace(" ","")
  53. # search for the application ID in the content element and display it
  54. application_id = re.search('Application.*\n(.*)\n', input)
  55. output['application_id'] = application_id.group(1)
  56. # search for the EPO publication URL in the content element and display it
  57. epo_publication = re.search('Publication.*\n(.*)\n', input)
  58. output['epo_publication_url'] = epo_publication.group(1)
  59. # search for the IPC publication URL in the content element and display it
  60. ipc_publication = re.search('IPC.*\n(.*)\n', input)
  61. output['ipc_publication_url'] = ipc_publication.group(1)
  62. # search for the title in the content element and display it
  63. title = re.search('Title.*\n(.*)\n', input)
  64. if title is not None:
  65. output['title'] = title.group(1)
  66. # search for the abstract in the content element and display it
  67. abstract = re.search('Abstract.*\n(.*)\n', input)
  68. if abstract is None:
  69. abstract = re.search('\(.\) \\n\\n(.*)\\n', input)
  70. if abstract is not None:
  71. output['abstract'] = abstract.group(1);
  72. # search for the year in the content element and display it
  73. year = re.search('=D[^\s]*\s[^\s]*\s[^\s]*\s[^\s]*\s(\d{4})', input)
  74. if year is not None:
  75. output['year'] = year.group(1)
  76. return output
  77. def get_random_record(core):
  78. rand = str(random.randint(0, 9999999))
  79. # Assemble a query string to send to Solr. This uses the Solr hostname from config.env. Solr's query syntax can be found at many sites including https://lucene.apache.org/solr/guide/6_6/the-standard-query-parser.html
  80. solrurl = 'http://' + solr_hostname + ':' + solr_port + '/solr/' + core + '/select?q.op=OR&q=*%3A*&wt=json&sort=random_' + rand + '%20asc&rows=1'
  81. # get result
  82. request = requests.get(solrurl)
  83. # turn the API response into useful Json
  84. json = request.json()
  85. if (json['response']['numFound'] == 0):
  86. output = 'no results found'
  87. else:
  88. output = []
  89. for result in json['response']['docs']:
  90. # set ID variables
  91. id = result['id']
  92. # set content variable
  93. content = result['content']
  94. # parse result
  95. result_output = parse_result(id, content)
  96. output.append(result_output)
  97. return output
  98. def get_ten_random_elements(field):
  99. core = 'all'
  100. output = []
  101. i = 0
  102. while i <= 9:
  103. results = get_random_record(core)
  104. for result in results:
  105. if field in result:
  106. dict = {'id': result['id'], field: result[field]}
  107. output.append(dict)
  108. i += 1
  109. return output
  110. def get_ten_random_images():
  111. core = 'all'
  112. output = []
  113. i = 0
  114. while i <= 9:
  115. results = get_random_record(core)
  116. for result in results:
  117. if ops.get_images(result['doc_ref']):
  118. image = ops.get_images(result['doc_ref'])
  119. result.update(image)
  120. output.append(result)
  121. i += 1
  122. return output