A search interface for data from the Politics of Patents case study (part of Copim WP6): this parses data from the archive of RTF files and provides additional data from the European Patent Office API. https://patents.copim.ac.uk
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

116 lines
2.8KB

  /**
   * Resolve a fully-qualified Java class name to the engine's handle for that class.
   *
   * Tries the Nashorn route first (`Java.type(name)["class"]`); if that throws for
   * any reason (for example because `Java` is not defined), falls back to walking
   * the Rhino-style `Packages` tree one dotted segment at a time.
   *
   * No `eval` is used: the class name is never turned into source code, so a
   * malformed or hostile name cannot execute anything.
   *
   * @param name fully-qualified class name, e.g. "java.lang.String"
   * @return whatever the engine exposes for that class (the `class` property of
   *         the Nashorn type, or the object reached under `Packages`)
   */
  function get_class(name) {
    var clazz;
    try {
      // Java8 Nashorn. Bracket access avoids writing the reserved word `class`
      // in dotted-property position (presumably why the lookup was hidden
      // inside eval before -- older parsers may reject `.class`).
      clazz = Java.type(name)["class"];
    } catch (e) {
      // Java7 Rhino: equivalent of Packages.<a>.<b>.<C>, without eval.
      // String(name) guarantees the JavaScript split(), not a Java one.
      var segments = String(name).split(".");
      clazz = Packages;
      for (var i = 0; i < segments.length; i++) {
        clazz = clazz[segments[i]];
      }
    }
    return clazz;
  }
  /**
   * Map a content type (parameters already stripped) to a coarse, user-friendly
   * facet value, so that e.g. image/jpeg and image/tiff both land in "image".
   *
   * Rules are tried in order and the first match wins.
   *
   * @param ct content type string such as "application/pdf"
   * @return the facet value, or undefined when no rule matches
   */
  function doc_type_for(ct) {
    var rules = [
      [/^application\/rtf/, "doc"],
      [/wordprocessing/, "doc"],
      [/html/, "html"],
      [/^image\/.*/, "image"],
      [/presentation|powerpoint/, "presentation"],
      [/spreadsheet|excel/, "spreadsheet"],
      [/^application\/pdf/, "pdf"],
      [/^text\/plain/, "text"]
    ];
    for (var i = 0; i < rules.length; i++) {
      if (rules[i][0].test(ct)) {
        return rules[i][1];
      }
    }
    return undefined;
  }

  /**
   * Enrich a document that is being added.
   *
   * 1. Reads "content_type", drops everything from the first ';' onward, and
   *    writes:
   *      - "doc_type"               (only when doc_type_for() finds a match)
   *      - "content_type_type_s"    (part before the '/')
   *      - "content_type_subtype_s" (part after the '/')
   *    If the value has no '/', the whole value is stored as the type and no
   *    subtype field is written.
   * 2. Reads "content"; if present, runs it through the index analyzer of the
   *    "text_email_url" field type and adds every token's text to a field named
   *    after the token type (angle brackets removed, lower-cased, "_ss" suffix).
   *
   * Relies on `logger`, `req` and `get_class` being available in the script scope.
   *
   * @param cmd add command; `cmd.solrDoc` is the document to modify
   *            (org.apache.solr.common.SolrInputDocument)
   */
  function processAdd(cmd) {
    // Declared with var: a bare assignment would create/overwrite a global.
    var doc = cmd.solrDoc;
    var id = doc.getFieldValue("id");
    logger.info("update-script#processAdd: id=" + id);

    var ct = doc.getFieldValue("content_type");
    if (ct) {
      // strip off semicolon onward
      var semicolon_index = ct.indexOf(';');
      if (semicolon_index != -1) {
        ct = ct.substring(0, semicolon_index);
      }

      var doc_type = doc_type_for(ct);
      if (doc_type) { doc.setField("doc_type", doc_type); }

      // split type/subtype, tolerating a value without a slash
      var slash_index = ct.indexOf('/');
      if (slash_index != -1) {
        doc.setField("content_type_type_s", ct.substring(0, slash_index));
        doc.setField("content_type_subtype_s", ct.substring(slash_index + 1));
      } else {
        doc.setField("content_type_type_s", ct);
      }
    }

    var content = doc.getFieldValue("content");
    if (!content) {
      return; // No content found, so we are done here
    }

    var analyzer =
      req.getCore().getLatestSchema()
        .getFieldTypeByName("text_email_url")
        .getIndexAnalyzer();
    var token_stream = analyzer.tokenStream("content", content);
    try {
      var term_att = token_stream.getAttribute(get_class("org.apache.lucene.analysis.tokenattributes.CharTermAttribute"));
      var type_att = token_stream.getAttribute(get_class("org.apache.lucene.analysis.tokenattributes.TypeAttribute"));
      token_stream.reset();
      while (token_stream.incrementToken()) {
        doc.addField(type_att.type().replace(/\<|\>/g,'').toLowerCase()+"_ss", term_att.toString());
      }
      token_stream.end();
    } finally {
      // Always close, even if tokenization fails part-way through.
      token_stream.close();
    }
  }
  /**
   * Delete hook (presumably invoked by the script update processor for delete
   * commands -- confirm against the Solr configuration). Does nothing.
   *
   * @param cmd the delete command; ignored
   */
  function processDelete(cmd) {
    // Intentionally empty: deletes need no extra processing.
  }
  /**
   * Merge-indexes hook (presumably invoked by the script update processor for
   * merge commands -- confirm against the Solr configuration). Does nothing.
   *
   * @param cmd the merge-indexes command; ignored
   */
  function processMergeIndexes(cmd) {
    // Intentionally empty: merges need no extra processing.
  }
  /**
   * Commit hook (presumably invoked by the script update processor for commit
   * commands -- confirm against the Solr configuration). Does nothing.
   *
   * @param cmd the commit command; ignored
   */
  function processCommit(cmd) {
    // Intentionally empty: commits need no extra processing.
  }
  /**
   * Rollback hook (presumably invoked by the script update processor for
   * rollback commands -- confirm against the Solr configuration). Does nothing.
   *
   * @param cmd the rollback command; ignored
   */
  function processRollback(cmd) {
    // Intentionally empty: rollbacks need no extra processing.
  }
  /**
   * Finish hook (presumably invoked once the update request has been fully
   * processed -- confirm against the Solr configuration). Does nothing.
   */
  function finish() {
    // Intentionally empty: nothing to flush or release here.
  }