Text Duplicate filter in Python 3.7
up vote
5
down vote
favorite
I wrote this program just to remove duplicates from wordlist used for brute-force, but it can be used for every txt file i guess.
It is a pretty long program for what it does, and I'm sure there is a way I can make it easier and better for everyone to look at.
I'm still a beginner in python.
def wordcount_initial(): # count the numbre of words in the input file
global num_words_initial
with open(file_initial, 'r') as f:
for line in f:
words = line.split()
num_words_initial += len(words)
def dupfilter():
content = open(file_initial, "r").readlines()
content_set = set(content) # filter duplicates
cleandata = open(file_final, "w+") # write the text filtered in the new file
for line in content_set:
cleandata.write(line)
def wordcount_final(): # count number of words in the new file
global num_words_final
with open(file_final, 'r') as f:
for line in f:
words = line.split()
num_words_final += len(words)
if __name__ == '__main__':
num_words = 0
num_words_initial = 0
num_words_final = 0
ready = False
while not ready:
file_initial = input("What is the name of the text?")
file_final = input("How do you want to name the filted file?")
if file_initial and file_final != "":
wordcount_initial()
ready = True
dupfilter()
wordcount_final()
print("Number of duplicates filtered:" + str(num_words_initial - num_words_final))
input("nPress <ENTER> to quit the program.")
python beginner python-3.x file
add a comment |
up vote
5
down vote
favorite
I wrote this program just to remove duplicates from wordlist used for brute-force, but it can be used for every txt file i guess.
It is a pretty long program for what it does, and I'm sure there is a way I can make it easier and better for everyone to look at.
I'm still a beginner in python.
def wordcount_initial(): # count the numbre of words in the input file
global num_words_initial
with open(file_initial, 'r') as f:
for line in f:
words = line.split()
num_words_initial += len(words)
def dupfilter():
content = open(file_initial, "r").readlines()
content_set = set(content) # filter duplicates
cleandata = open(file_final, "w+") # write the text filtered in the new file
for line in content_set:
cleandata.write(line)
def wordcount_final(): # count number of words in the new file
global num_words_final
with open(file_final, 'r') as f:
for line in f:
words = line.split()
num_words_final += len(words)
if __name__ == '__main__':
num_words = 0
num_words_initial = 0
num_words_final = 0
ready = False
while not ready:
file_initial = input("What is the name of the text?")
file_final = input("How do you want to name the filted file?")
if file_initial and file_final != "":
wordcount_initial()
ready = True
dupfilter()
wordcount_final()
print("Number of duplicates filtered:" + str(num_words_initial - num_words_final))
input("nPress <ENTER> to quit the program.")
python beginner python-3.x file
add a comment |
up vote
5
down vote
favorite
up vote
5
down vote
favorite
I wrote this program just to remove duplicates from wordlist used for brute-force, but it can be used for every txt file i guess.
It is a pretty long program for what it does, and I'm sure there is a way I can make it easier and better for everyone to look at.
I'm still a beginner in python.
def wordcount_initial(): # count the numbre of words in the input file
global num_words_initial
with open(file_initial, 'r') as f:
for line in f:
words = line.split()
num_words_initial += len(words)
def dupfilter():
content = open(file_initial, "r").readlines()
content_set = set(content) # filter duplicates
cleandata = open(file_final, "w+") # write the text filtered in the new file
for line in content_set:
cleandata.write(line)
def wordcount_final(): # count number of words in the new file
global num_words_final
with open(file_final, 'r') as f:
for line in f:
words = line.split()
num_words_final += len(words)
if __name__ == '__main__':
num_words = 0
num_words_initial = 0
num_words_final = 0
ready = False
while not ready:
file_initial = input("What is the name of the text?")
file_final = input("How do you want to name the filted file?")
if file_initial and file_final != "":
wordcount_initial()
ready = True
dupfilter()
wordcount_final()
print("Number of duplicates filtered:" + str(num_words_initial - num_words_final))
input("nPress <ENTER> to quit the program.")
python beginner python-3.x file
I wrote this program just to remove duplicates from wordlist used for brute-force, but it can be used for every txt file i guess.
It is a pretty long program for what it does, and I'm sure there is a way I can make it easier and better for everyone to look at.
I'm still a beginner in python.
def wordcount_initial(): # count the numbre of words in the input file
global num_words_initial
with open(file_initial, 'r') as f:
for line in f:
words = line.split()
num_words_initial += len(words)
def dupfilter():
content = open(file_initial, "r").readlines()
content_set = set(content) # filter duplicates
cleandata = open(file_final, "w+") # write the text filtered in the new file
for line in content_set:
cleandata.write(line)
def wordcount_final(): # count number of words in the new file
global num_words_final
with open(file_final, 'r') as f:
for line in f:
words = line.split()
num_words_final += len(words)
if __name__ == '__main__':
num_words = 0
num_words_initial = 0
num_words_final = 0
ready = False
while not ready:
file_initial = input("What is the name of the text?")
file_final = input("How do you want to name the filted file?")
if file_initial and file_final != "":
wordcount_initial()
ready = True
dupfilter()
wordcount_final()
print("Number of duplicates filtered:" + str(num_words_initial - num_words_final))
input("nPress <ENTER> to quit the program.")
python beginner python-3.x file
python beginner python-3.x file
edited 10 hours ago
200_success
127k15148411
127k15148411
asked 12 hours ago
Thewizy
426
426
add a comment |
add a comment |
2 Answers
2
active
oldest
votes
up vote
6
down vote
accepted
You should not use global variables unless you really need to. Instead, pass relevant parameters as arguments to the functions and return the results. This makes them actually reusable. Currently you have two functions to count the initial and final number of words, when you could just have a word_count
function:
def wordcount(file_name):
"""count the number of words in the file"""
with open(file_name) as f:
return sum(len(line.split()) for line in f)
def dupfilter(file_initial, file_final):
with open(file_initial) as in_file, open(file_final, "w") as out_file:
out_file.writelines(set(in_file.readlines()))
if __name__ == '__main__':
while True:
file_initial = input("What is the name of the text?")
file_final = input("How do you want to name the filtered file?")
if file_initial and file_final and file_initial != file_final:
break
num_words_initial = wordcount(file_initial)
dupfilter(file_initial, file_final)
num_words_final = wordcount(file_final)
print("Number of duplicates filtered:", num_words_initial - num_words_final)
input("nPress <ENTER> to quit the program.")
I also used sum
with a generator expression to simplify the wordcount
function, used with
to ensure that files are properly closed.
In addition, I replaced the while not ready
loop with a while True
loop and an explicit break
. This is much more common in Python, especially for user input.
Note that if file_initial and file_final != ""
is only incidentally the same as if file_initial != "" and file_final != ""
because non-empty strings are truthy. This is why it is also equivalent to if file_initial and file_final
. But for example x and y == 3
is not the same as x == 3 and y == 3
.
You also don't need to call str
on things to be printed, print
does that for you if necessary.
Note that using set
does not guarantee the same order as the original file, for that you would want to use the itertools
recipe unique_everseen
:
from itertools import filterfalse
def unique_everseen(iterable, key=None):
"List unique elements, preserving order. Remember all elements ever seen."
# unique_everseen('AAAABBBCCDAABBB') --> A B C D
# unique_everseen('ABBCcAD', str.lower) --> A B C D
seen = set()
seen_add = seen.add
if key is None:
for element in filterfalse(seen.__contains__, iterable):
seen_add(element)
yield element
else:
for element in iterable:
k = key(element)
if k not in seen:
seen_add(k)
yield element
def dupfilter(file_initial, file_final):
with open(file_initial) as in_file, open(file_final, "w") as out_file:
out_file.writelines(unique_everseen(in_file))
1
Nice section on itertools instead of set!
– Julien Rousé
9 hours ago
Thanks a lot your help is really precious!
– Thewizy
7 hours ago
1
I didn't know that set could not give the same order when filtering the duplicate thanks!
– Thewizy
7 hours ago
add a comment |
up vote
4
down vote
Thanks for sharing your code!
I'll begin by pointing out you don't need global
variable, and you should avoid them when possible. Very often it only does make your code less readable and more fragile.
Here your function wordcount_initial
could be rewritten as: (same idea for wordcount_final
)
def wordcount_initial(input_file):
"""Return the number of words in the file given in parameter
:param input_file: A file name
:type input_file: string
:return: The number of words in the file `input_file`
:rtype: int
"""
num_words_initial = 0
with open(input_file, 'r') as f:
for line in f:
words = line.split()
num_words_initial += len(words)
return num_words_initial
There are a few changes here:
- Removed
num_words_initial
as a global and return it's value at the end of the function. It's much more clean to return value that way when you can. It also help when you want to test your functions. - Gives
input_file
as a parameter of your function instead of relying on another global. It makes your function more reusable. - And I transformed your comment in docstring that can be used to generate documentation for your code. See ReStrusctured and sphinx for more information. Ideally every function should be documented (and every module too)
Another remark, you name your function dupfilter
, but every other name in your code has the format first_last
which is a bit inconsistant. Also, don't try to gain a few letters when typing, write duplicatefilter
or better (in my opinion) filter_duplicate
.
Naming is always a bit subjective, use your best judgement.
And finally in your __main__
you could have put the logic for initialising the name of the file into another function but that's not very important.
On a more positive note I like how you laid out your code, it is well spaced, most of the name are clear when you read them and you have comment which is often important.
Nice job!
Thank a lot for your help i will try to apply all thoses advice to my futur programs!
– Thewizy
7 hours ago
add a comment |
2 Answers
2
active
oldest
votes
2 Answers
2
active
oldest
votes
active
oldest
votes
active
oldest
votes
up vote
6
down vote
accepted
You should not use global variables unless you really need to. Instead, pass relevant parameters as arguments to the functions and return the results. This makes them actually reusable. Currently you have two functions to count the initial and final number of words, when you could just have a word_count
function:
def wordcount(file_name):
"""count the number of words in the file"""
with open(file_name) as f:
return sum(len(line.split()) for line in f)
def dupfilter(file_initial, file_final):
with open(file_initial) as in_file, open(file_final, "w") as out_file:
out_file.writelines(set(in_file.readlines()))
if __name__ == '__main__':
while True:
file_initial = input("What is the name of the text?")
file_final = input("How do you want to name the filtered file?")
if file_initial and file_final and file_initial != file_final:
break
num_words_initial = wordcount(file_initial)
dupfilter(file_initial, file_final)
num_words_final = wordcount(file_final)
print("Number of duplicates filtered:", num_words_initial - num_words_final)
input("nPress <ENTER> to quit the program.")
I also used sum
with a generator expression to simplify the wordcount
function, used with
to ensure that files are properly closed.
In addition, I replaced the while not ready
loop with a while True
loop and an explicit break
. This is much more common in Python, especially for user input.
Note that if file_initial and file_final != ""
is only incidentally the same as if file_initial != "" and file_final != ""
because non-empty strings are truthy. This is why it is also equivalent to if file_initial and file_final
. But for example x and y == 3
is not the same as x == 3 and y == 3
.
You also don't need to call str
on things to be printed, print
does that for you if necessary.
Note that using set
does not guarantee the same order as the original file, for that you would want to use the itertools
recipe unique_everseen
:
from itertools import filterfalse
def unique_everseen(iterable, key=None):
"List unique elements, preserving order. Remember all elements ever seen."
# unique_everseen('AAAABBBCCDAABBB') --> A B C D
# unique_everseen('ABBCcAD', str.lower) --> A B C D
seen = set()
seen_add = seen.add
if key is None:
for element in filterfalse(seen.__contains__, iterable):
seen_add(element)
yield element
else:
for element in iterable:
k = key(element)
if k not in seen:
seen_add(k)
yield element
def dupfilter(file_initial, file_final):
with open(file_initial) as in_file, open(file_final, "w") as out_file:
out_file.writelines(unique_everseen(in_file))
1
Nice section on itertools instead of set!
– Julien Rousé
9 hours ago
Thanks a lot your help is really precious!
– Thewizy
7 hours ago
1
I didn't know that set could not give the same order when filtering the duplicate thanks!
– Thewizy
7 hours ago
add a comment |
up vote
6
down vote
accepted
You should not use global variables unless you really need to. Instead, pass relevant parameters as arguments to the functions and return the results. This makes them actually reusable. Currently you have two functions to count the initial and final number of words, when you could just have a word_count
function:
def wordcount(file_name):
"""count the number of words in the file"""
with open(file_name) as f:
return sum(len(line.split()) for line in f)
def dupfilter(file_initial, file_final):
with open(file_initial) as in_file, open(file_final, "w") as out_file:
out_file.writelines(set(in_file.readlines()))
if __name__ == '__main__':
while True:
file_initial = input("What is the name of the text?")
file_final = input("How do you want to name the filtered file?")
if file_initial and file_final and file_initial != file_final:
break
num_words_initial = wordcount(file_initial)
dupfilter(file_initial, file_final)
num_words_final = wordcount(file_final)
print("Number of duplicates filtered:", num_words_initial - num_words_final)
input("nPress <ENTER> to quit the program.")
I also used sum
with a generator expression to simplify the wordcount
function, used with
to ensure that files are properly closed.
In addition, I replaced the while not ready
loop with a while True
loop and an explicit break
. This is much more common in Python, especially for user input.
Note that if file_initial and file_final != ""
is only incidentally the same as if file_initial != "" and file_final != ""
because non-empty strings are truthy. This is why it is also equivalent to if file_initial and file_final
. But for example x and y == 3
is not the same as x == 3 and y == 3
.
You also don't need to call str
on things to be printed, print
does that for you if necessary.
Note that using set
does not guarantee the same order as the original file, for that you would want to use the itertools
recipe unique_everseen
:
from itertools import filterfalse
def unique_everseen(iterable, key=None):
"List unique elements, preserving order. Remember all elements ever seen."
# unique_everseen('AAAABBBCCDAABBB') --> A B C D
# unique_everseen('ABBCcAD', str.lower) --> A B C D
seen = set()
seen_add = seen.add
if key is None:
for element in filterfalse(seen.__contains__, iterable):
seen_add(element)
yield element
else:
for element in iterable:
k = key(element)
if k not in seen:
seen_add(k)
yield element
def dupfilter(file_initial, file_final):
with open(file_initial) as in_file, open(file_final, "w") as out_file:
out_file.writelines(unique_everseen(in_file))
1
Nice section on itertools instead of set!
– Julien Rousé
9 hours ago
Thanks a lot your help is really precious!
– Thewizy
7 hours ago
1
I didn't know that set could not give the same order when filtering the duplicate thanks!
– Thewizy
7 hours ago
add a comment |
up vote
6
down vote
accepted
up vote
6
down vote
accepted
You should not use global variables unless you really need to. Instead, pass relevant parameters as arguments to the functions and return the results. This makes them actually reusable. Currently you have two functions to count the initial and final number of words, when you could just have a word_count
function:
def wordcount(file_name):
"""count the number of words in the file"""
with open(file_name) as f:
return sum(len(line.split()) for line in f)
def dupfilter(file_initial, file_final):
with open(file_initial) as in_file, open(file_final, "w") as out_file:
out_file.writelines(set(in_file.readlines()))
if __name__ == '__main__':
while True:
file_initial = input("What is the name of the text?")
file_final = input("How do you want to name the filtered file?")
if file_initial and file_final and file_initial != file_final:
break
num_words_initial = wordcount(file_initial)
dupfilter(file_initial, file_final)
num_words_final = wordcount(file_final)
print("Number of duplicates filtered:", num_words_initial - num_words_final)
input("nPress <ENTER> to quit the program.")
I also used sum
with a generator expression to simplify the wordcount
function, used with
to ensure that files are properly closed.
In addition, I replaced the while not ready
loop with a while True
loop and an explicit break
. This is much more common in Python, especially for user input.
Note that if file_initial and file_final != ""
is only incidentally the same as if file_initial != "" and file_final != ""
because non-empty strings are truthy. This is why it is also equivalent to if file_initial and file_final
. But for example x and y == 3
is not the same as x == 3 and y == 3
.
You also don't need to call str
on things to be printed, print
does that for you if necessary.
Note that using set
does not guarantee the same order as the original file, for that you would want to use the itertools
recipe unique_everseen
:
from itertools import filterfalse
def unique_everseen(iterable, key=None):
"List unique elements, preserving order. Remember all elements ever seen."
# unique_everseen('AAAABBBCCDAABBB') --> A B C D
# unique_everseen('ABBCcAD', str.lower) --> A B C D
seen = set()
seen_add = seen.add
if key is None:
for element in filterfalse(seen.__contains__, iterable):
seen_add(element)
yield element
else:
for element in iterable:
k = key(element)
if k not in seen:
seen_add(k)
yield element
def dupfilter(file_initial, file_final):
with open(file_initial) as in_file, open(file_final, "w") as out_file:
out_file.writelines(unique_everseen(in_file))
You should not use global variables unless you really need to. Instead, pass relevant parameters as arguments to the functions and return the results. This makes them actually reusable. Currently you have two functions to count the initial and final number of words, when you could just have a word_count
function:
def wordcount(file_name):
"""count the number of words in the file"""
with open(file_name) as f:
return sum(len(line.split()) for line in f)
def dupfilter(file_initial, file_final):
with open(file_initial) as in_file, open(file_final, "w") as out_file:
out_file.writelines(set(in_file.readlines()))
if __name__ == '__main__':
while True:
file_initial = input("What is the name of the text?")
file_final = input("How do you want to name the filtered file?")
if file_initial and file_final and file_initial != file_final:
break
num_words_initial = wordcount(file_initial)
dupfilter(file_initial, file_final)
num_words_final = wordcount(file_final)
print("Number of duplicates filtered:", num_words_initial - num_words_final)
input("nPress <ENTER> to quit the program.")
I also used sum
with a generator expression to simplify the wordcount
function, used with
to ensure that files are properly closed.
In addition, I replaced the while not ready
loop with a while True
loop and an explicit break
. This is much more common in Python, especially for user input.
Note that if file_initial and file_final != ""
is only incidentally the same as if file_initial != "" and file_final != ""
because non-empty strings are truthy. This is why it is also equivalent to if file_initial and file_final
. But for example x and y == 3
is not the same as x == 3 and y == 3
.
You also don't need to call str
on things to be printed, print
does that for you if necessary.
Note that using set
does not guarantee the same order as the original file, for that you would want to use the itertools
recipe unique_everseen
:
from itertools import filterfalse
def unique_everseen(iterable, key=None):
"List unique elements, preserving order. Remember all elements ever seen."
# unique_everseen('AAAABBBCCDAABBB') --> A B C D
# unique_everseen('ABBCcAD', str.lower) --> A B C D
seen = set()
seen_add = seen.add
if key is None:
for element in filterfalse(seen.__contains__, iterable):
seen_add(element)
yield element
else:
for element in iterable:
k = key(element)
if k not in seen:
seen_add(k)
yield element
def dupfilter(file_initial, file_final):
with open(file_initial) as in_file, open(file_final, "w") as out_file:
out_file.writelines(unique_everseen(in_file))
edited 9 hours ago
answered 9 hours ago
Graipher
22.4k53284
22.4k53284
1
Nice section on itertools instead of set!
– Julien Rousé
9 hours ago
Thanks a lot your help is really precious!
– Thewizy
7 hours ago
1
I didn't know that set could not give the same order when filtering the duplicate thanks!
– Thewizy
7 hours ago
add a comment |
1
Nice section on itertools instead of set!
– Julien Rousé
9 hours ago
Thanks a lot your help is really precious!
– Thewizy
7 hours ago
1
I didn't know that set could not give the same order when filtering the duplicate thanks!
– Thewizy
7 hours ago
1
1
Nice section on itertools instead of set!
– Julien Rousé
9 hours ago
Nice section on itertools instead of set!
– Julien Rousé
9 hours ago
Thanks a lot your help is really precious!
– Thewizy
7 hours ago
Thanks a lot your help is really precious!
– Thewizy
7 hours ago
1
1
I didn't know that set could not give the same order when filtering the duplicate thanks!
– Thewizy
7 hours ago
I didn't know that set could not give the same order when filtering the duplicate thanks!
– Thewizy
7 hours ago
add a comment |
up vote
4
down vote
Thanks for sharing your code!
I'll begin by pointing out you don't need global
variable, and you should avoid them when possible. Very often it only does make your code less readable and more fragile.
Here your function wordcount_initial
could be rewritten as: (same idea for wordcount_final
)
def wordcount_initial(input_file):
"""Return the number of words in the file given in parameter
:param input_file: A file name
:type input_file: string
:return: The number of words in the file `input_file`
:rtype: int
"""
num_words_initial = 0
with open(input_file, 'r') as f:
for line in f:
words = line.split()
num_words_initial += len(words)
return num_words_initial
There are a few changes here:
- Removed
num_words_initial
as a global and return it's value at the end of the function. It's much more clean to return value that way when you can. It also help when you want to test your functions. - Gives
input_file
as a parameter of your function instead of relying on another global. It makes your function more reusable. - And I transformed your comment in docstring that can be used to generate documentation for your code. See ReStrusctured and sphinx for more information. Ideally every function should be documented (and every module too)
Another remark, you name your function dupfilter
, but every other name in your code has the format first_last
which is a bit inconsistant. Also, don't try to gain a few letters when typing, write duplicatefilter
or better (in my opinion) filter_duplicate
.
Naming is always a bit subjective, use your best judgement.
And finally in your __main__
you could have put the logic for initialising the name of the file into another function but that's not very important.
On a more positive note I like how you laid out your code, it is well spaced, most of the name are clear when you read them and you have comment which is often important.
Nice job!
Thank a lot for your help i will try to apply all thoses advice to my futur programs!
– Thewizy
7 hours ago
add a comment |
up vote
4
down vote
Thanks for sharing your code!
I'll begin by pointing out you don't need global
variable, and you should avoid them when possible. Very often it only does make your code less readable and more fragile.
Here your function wordcount_initial
could be rewritten as: (same idea for wordcount_final
)
def wordcount_initial(input_file):
"""Return the number of words in the file given in parameter
:param input_file: A file name
:type input_file: string
:return: The number of words in the file `input_file`
:rtype: int
"""
num_words_initial = 0
with open(input_file, 'r') as f:
for line in f:
words = line.split()
num_words_initial += len(words)
return num_words_initial
There are a few changes here:
- Removed
num_words_initial
as a global and return it's value at the end of the function. It's much more clean to return value that way when you can. It also help when you want to test your functions. - Gives
input_file
as a parameter of your function instead of relying on another global. It makes your function more reusable. - And I transformed your comment in docstring that can be used to generate documentation for your code. See ReStrusctured and sphinx for more information. Ideally every function should be documented (and every module too)
Another remark, you name your function dupfilter
, but every other name in your code has the format first_last
which is a bit inconsistant. Also, don't try to gain a few letters when typing, write duplicatefilter
or better (in my opinion) filter_duplicate
.
Naming is always a bit subjective, use your best judgement.
And finally in your __main__
you could have put the logic for initialising the name of the file into another function but that's not very important.
On a more positive note I like how you laid out your code, it is well spaced, most of the name are clear when you read them and you have comment which is often important.
Nice job!
Thank a lot for your help i will try to apply all thoses advice to my futur programs!
– Thewizy
7 hours ago
add a comment |
up vote
4
down vote
up vote
4
down vote
Thanks for sharing your code!
I'll begin by pointing out you don't need global
variable, and you should avoid them when possible. Very often it only does make your code less readable and more fragile.
Here your function wordcount_initial
could be rewritten as: (same idea for wordcount_final
)
def wordcount_initial(input_file):
"""Return the number of words in the file given in parameter
:param input_file: A file name
:type input_file: string
:return: The number of words in the file `input_file`
:rtype: int
"""
num_words_initial = 0
with open(input_file, 'r') as f:
for line in f:
words = line.split()
num_words_initial += len(words)
return num_words_initial
There are a few changes here:
- Removed
num_words_initial
as a global and return it's value at the end of the function. It's much more clean to return value that way when you can. It also help when you want to test your functions. - Gives
input_file
as a parameter of your function instead of relying on another global. It makes your function more reusable. - And I transformed your comment in docstring that can be used to generate documentation for your code. See ReStrusctured and sphinx for more information. Ideally every function should be documented (and every module too)
Another remark, you name your function dupfilter
, but every other name in your code has the format first_last
which is a bit inconsistant. Also, don't try to gain a few letters when typing, write duplicatefilter
or better (in my opinion) filter_duplicate
.
Naming is always a bit subjective, use your best judgement.
And finally in your __main__
you could have put the logic for initialising the name of the file into another function but that's not very important.
On a more positive note I like how you laid out your code, it is well spaced, most of the name are clear when you read them and you have comment which is often important.
Nice job!
Thanks for sharing your code!
I'll begin by pointing out you don't need global
variable, and you should avoid them when possible. Very often it only does make your code less readable and more fragile.
Here your function wordcount_initial
could be rewritten as: (same idea for wordcount_final
)
def wordcount_initial(input_file):
"""Return the number of words in the file given in parameter
:param input_file: A file name
:type input_file: string
:return: The number of words in the file `input_file`
:rtype: int
"""
num_words_initial = 0
with open(input_file, 'r') as f:
for line in f:
words = line.split()
num_words_initial += len(words)
return num_words_initial
There are a few changes here:
- Removed
num_words_initial
as a global and return it's value at the end of the function. It's much more clean to return value that way when you can. It also help when you want to test your functions. - Gives
input_file
as a parameter of your function instead of relying on another global. It makes your function more reusable. - And I transformed your comment in docstring that can be used to generate documentation for your code. See ReStrusctured and sphinx for more information. Ideally every function should be documented (and every module too)
Another remark, you name your function dupfilter
, but every other name in your code has the format first_last
which is a bit inconsistant. Also, don't try to gain a few letters when typing, write duplicatefilter
or better (in my opinion) filter_duplicate
.
Naming is always a bit subjective, use your best judgement.
And finally in your __main__
you could have put the logic for initialising the name of the file into another function but that's not very important.
On a more positive note I like how you laid out your code, it is well spaced, most of the name are clear when you read them and you have comment which is often important.
Nice job!
edited 8 hours ago
answered 9 hours ago
Julien Rousé
544416
544416
Thank a lot for your help i will try to apply all thoses advice to my futur programs!
– Thewizy
7 hours ago
add a comment |
Thank a lot for your help i will try to apply all thoses advice to my futur programs!
– Thewizy
7 hours ago
Thank a lot for your help i will try to apply all thoses advice to my futur programs!
– Thewizy
7 hours ago
Thank a lot for your help i will try to apply all thoses advice to my futur programs!
– Thewizy
7 hours ago
add a comment |
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f208508%2ftext-duplicate-filter-in-python-3-7%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown