import os

from striprtf.striprtf import rtf_to_text

directory = 'data/POP_Dataset_2022'

# A whitespace-separated token that contains any of these characters is
# treated as markup rather than a word and is left out of the count.
# NOTE(review): this is a character-based heuristic, not real RTF parsing;
# ordinary words that happen to contain '/' or '(' are dropped as well.
MARKUP_CHARS = ('\\', '}', '{', '/', '(')


def count_plain_words(content: str) -> int:
    """Return how many whitespace-separated tokens of *content* are plain words.

    A token is counted only when it contains none of the characters listed
    in ``MARKUP_CHARS``. An empty or whitespace-only string yields 0.
    """
    return sum(
        1
        for token in content.split()
        if not any(char in token for char in MARKUP_CHARS)
    )


def count_words_in_directory(path: str) -> int:
    """Return the summed plain-word count of every ``.rtf`` file under *path*.

    The tree is walked recursively with ``os.walk``. A path that does not
    exist contributes nothing, so the result is 0 in that case.
    """
    count = 0
    for root, _dirs, files in os.walk(path):
        for filename in files:
            # Test the actual extension: a substring check would also pick
            # up names such as "draft.rtf.bak".
            if not filename.endswith('.rtf'):
                continue
            # The context manager closes each handle as soon as it is read,
            # so a large tree cannot exhaust file descriptors.
            with open(os.path.join(root, filename), "rt") as handle:
                count += count_plain_words(handle.read())
    return count


total = count_words_in_directory(directory)
print(total)