A search interface for the Performing Patents Otherwise publication as part of the Politics of Patents case study (part of Copim WP6): this parses data from the archive of RTF files and provides additional data from the European Patent Office OPS API. https://patents.copim.ac.uk

38 Zeilen
1.3KB

  1. # import required modules
  2. import os
  3. from striprtf.striprtf import rtf_to_text
  # Root folder of the RTF archive whose words are counted.
directory = 'data/POP_Dataset_2022'

# A whitespace-separated token containing any of these characters is left out
# of the count. Presumably this is a cheap way to skip RTF markup (control
# words, group braces) plus paths/URLs and parenthesised fragments -- it is a
# heuristic applied to the raw RTF source, not to converted plain text.
_MARKUP_CHARS = frozenset('\\{}/(')


def count_plain_words(content):
    """Return the number of tokens in *content* that look like plain words.

    *content* is split on whitespace; every token that contains a backslash,
    a curly brace, a forward slash or an opening parenthesis is ignored.

    Args:
        content: Raw text of one file, as a string.

    Returns:
        The count of remaining tokens (0 for empty or whitespace-only input).
    """
    return sum(
        1 for token in content.split() if _MARKUP_CHARS.isdisjoint(token)
    )


def count_words_in_tree(root_dir):
    """Return the summed plain-word count of every ``.rtf`` file under *root_dir*.

    The tree is walked recursively with ``os.walk``; a missing directory
    simply yields a total of 0 because ``os.walk`` ignores listing errors by
    default.

    Args:
        root_dir: Path of the directory to scan.

    Returns:
        Total number of plain words across all matching files.
    """
    running_total = 0
    for root, _dirs, files in os.walk(root_dir):
        for filename in files:
            # Match on the real extension (case-insensitively) so that names
            # such as "notes.rtf.bak" are not mistaken for RTF documents.
            if not filename.lower().endswith('.rtf'):
                continue
            path = os.path.join(root, filename)
            # The context manager closes each file promptly instead of
            # leaking one handle per document. The encoding is left at the
            # platform default; undecodable bytes are replaced so a single
            # stray byte cannot abort the whole count.
            with open(path, "rt", errors="replace") as handle:
                running_total += count_plain_words(handle.read())
    return running_total


total = count_words_in_tree(directory)
print(total)