|
- # import required modules
- import os
- from striprtf.striprtf import rtf_to_text
-
# Root directory that is searched recursively for RTF files.
directory = 'data/POP_Dataset_2022'

# A whitespace-separated token containing any of these characters is dropped
# before counting -- presumably to skip RTF control words, group delimiters
# and similar non-prose tokens (TODO confirm the intent for '/' and '(').
MARKUP_CHARS = ('\\', '}', '{', '/', '(')


def count_plain_words(content):
    """Return how many tokens of *content* contain no marker character.

    *content* is split on whitespace; a token is counted only if none of
    the characters in ``MARKUP_CHARS`` occurs anywhere inside it.
    """
    return sum(
        1
        for token in content.split()
        if not any(marker in token for marker in MARKUP_CHARS)
    )


def count_words_in_tree(top):
    """Return the summed ``count_plain_words`` of every RTF file under *top*.

    The tree is traversed with ``os.walk``; a file is included when its
    name contains the substring ``'.rtf'``.
    """
    count = 0
    for root, _dirs, files in os.walk(top):
        for filename in files:
            # NOTE(review): this is a substring test, so names such as
            # 'notes.rtf.bak' are included as well; kept as-is so existing
            # totals do not change.
            if '.rtf' not in filename:
                continue
            path = os.path.join(root, filename)
            # The raw file text is tokenised directly (no RTF-to-text
            # conversion). The context manager guarantees the handle is
            # closed before the next file is opened.
            with open(path, "rt") as handle:
                count += count_plain_words(handle.read())
    return count


total = count_words_in_tree(directory)

print(total)
|