Text Duplicate filter in Python 3.7











up vote
5
down vote

favorite












I wrote this program just to remove duplicates from wordlist used for brute-force, but it can be used for every txt file i guess.
It is a pretty long program for what it does, and I'm sure there is a way I can make it easier and better for everyone to look at.
I'm still a beginner in python.



def wordcount_initial():  # count the numbre of words in the input file
global num_words_initial

with open(file_initial, 'r') as f:
for line in f:
words = line.split()
num_words_initial += len(words)


def dupfilter():
content = open(file_initial, "r").readlines()
content_set = set(content) # filter duplicates
cleandata = open(file_final, "w+") # write the text filtered in the new file

for line in content_set:
cleandata.write(line)


def wordcount_final(): # count number of words in the new file
global num_words_final

with open(file_final, 'r') as f:
for line in f:
words = line.split()
num_words_final += len(words)


if __name__ == '__main__':
num_words = 0
num_words_initial = 0
num_words_final = 0

ready = False
while not ready:
file_initial = input("What is the name of the text?")

file_final = input("How do you want to name the filted file?")

if file_initial and file_final != "":

wordcount_initial()
ready = True

dupfilter()

wordcount_final()

print("Number of duplicates filtered:" + str(num_words_initial - num_words_final))
input("nPress <ENTER> to quit the program.")









share|improve this question




























    up vote
    5
    down vote

    favorite












    I wrote this program just to remove duplicates from wordlist used for brute-force, but it can be used for every txt file i guess.
    It is a pretty long program for what it does, and I'm sure there is a way I can make it easier and better for everyone to look at.
    I'm still a beginner in python.



    def wordcount_initial():  # count the numbre of words in the input file
    global num_words_initial

    with open(file_initial, 'r') as f:
    for line in f:
    words = line.split()
    num_words_initial += len(words)


    def dupfilter():
    content = open(file_initial, "r").readlines()
    content_set = set(content) # filter duplicates
    cleandata = open(file_final, "w+") # write the text filtered in the new file

    for line in content_set:
    cleandata.write(line)


    def wordcount_final(): # count number of words in the new file
    global num_words_final

    with open(file_final, 'r') as f:
    for line in f:
    words = line.split()
    num_words_final += len(words)


    if __name__ == '__main__':
    num_words = 0
    num_words_initial = 0
    num_words_final = 0

    ready = False
    while not ready:
    file_initial = input("What is the name of the text?")

    file_final = input("How do you want to name the filted file?")

    if file_initial and file_final != "":

    wordcount_initial()
    ready = True

    dupfilter()

    wordcount_final()

    print("Number of duplicates filtered:" + str(num_words_initial - num_words_final))
    input("nPress <ENTER> to quit the program.")









    share|improve this question


























      up vote
      5
      down vote

      favorite









      up vote
      5
      down vote

      favorite











      I wrote this program just to remove duplicates from wordlist used for brute-force, but it can be used for every txt file i guess.
      It is a pretty long program for what it does, and I'm sure there is a way I can make it easier and better for everyone to look at.
      I'm still a beginner in python.



      def wordcount_initial():  # count the numbre of words in the input file
      global num_words_initial

      with open(file_initial, 'r') as f:
      for line in f:
      words = line.split()
      num_words_initial += len(words)


      def dupfilter():
      content = open(file_initial, "r").readlines()
      content_set = set(content) # filter duplicates
      cleandata = open(file_final, "w+") # write the text filtered in the new file

      for line in content_set:
      cleandata.write(line)


      def wordcount_final(): # count number of words in the new file
      global num_words_final

      with open(file_final, 'r') as f:
      for line in f:
      words = line.split()
      num_words_final += len(words)


      if __name__ == '__main__':
      num_words = 0
      num_words_initial = 0
      num_words_final = 0

      ready = False
      while not ready:
      file_initial = input("What is the name of the text?")

      file_final = input("How do you want to name the filted file?")

      if file_initial and file_final != "":

      wordcount_initial()
      ready = True

      dupfilter()

      wordcount_final()

      print("Number of duplicates filtered:" + str(num_words_initial - num_words_final))
      input("nPress <ENTER> to quit the program.")









      share|improve this question















      I wrote this program just to remove duplicates from wordlist used for brute-force, but it can be used for every txt file i guess.
      It is a pretty long program for what it does, and I'm sure there is a way I can make it easier and better for everyone to look at.
      I'm still a beginner in python.



      def wordcount_initial():  # count the numbre of words in the input file
      global num_words_initial

      with open(file_initial, 'r') as f:
      for line in f:
      words = line.split()
      num_words_initial += len(words)


      def dupfilter():
      content = open(file_initial, "r").readlines()
      content_set = set(content) # filter duplicates
      cleandata = open(file_final, "w+") # write the text filtered in the new file

      for line in content_set:
      cleandata.write(line)


      def wordcount_final(): # count number of words in the new file
      global num_words_final

      with open(file_final, 'r') as f:
      for line in f:
      words = line.split()
      num_words_final += len(words)


      if __name__ == '__main__':
      num_words = 0
      num_words_initial = 0
      num_words_final = 0

      ready = False
      while not ready:
      file_initial = input("What is the name of the text?")

      file_final = input("How do you want to name the filted file?")

      if file_initial and file_final != "":

      wordcount_initial()
      ready = True

      dupfilter()

      wordcount_final()

      print("Number of duplicates filtered:" + str(num_words_initial - num_words_final))
      input("nPress <ENTER> to quit the program.")






      python beginner python-3.x file






      share|improve this question















      share|improve this question













      share|improve this question




      share|improve this question








      edited 10 hours ago









      200_success

      127k15148411




      127k15148411










      asked 12 hours ago









      Thewizy

      426




      426






















          2 Answers
          2






          active

          oldest

          votes

















          up vote
          6
          down vote



          accepted










          You should not use global variables unless you really need to. Instead, pass relevant parameters as arguments to the functions and return the results. This makes them actually reusable. Currently you have two functions to count the initial and final number of words, when you could just have a word_count function:



          def wordcount(file_name):
          """count the number of words in the file"""
          with open(file_name) as f:
          return sum(len(line.split()) for line in f)

          def dupfilter(file_initial, file_final):
          with open(file_initial) as in_file, open(file_final, "w") as out_file:
          out_file.writelines(set(in_file.readlines()))

          if __name__ == '__main__':
          while True:
          file_initial = input("What is the name of the text?")
          file_final = input("How do you want to name the filtered file?")
          if file_initial and file_final and file_initial != file_final:
          break

          num_words_initial = wordcount(file_initial)
          dupfilter(file_initial, file_final)
          num_words_final = wordcount(file_final)

          print("Number of duplicates filtered:", num_words_initial - num_words_final)
          input("nPress <ENTER> to quit the program.")


          I also used sum with a generator expression to simplify the wordcount function, used with to ensure that files are properly closed.
          In addition, I replaced the while not ready loop with a while True loop and an explicit break. This is much more common in Python, especially for user input.



          Note that if file_initial and file_final != "" is only incidentally the same as if file_initial != "" and file_final != "" because non-empty strings are truthy. This is why it is also equivalent to if file_initial and file_final. But for example x and y == 3 is not the same as x == 3 and y == 3.



          You also don't need to call str on things to be printed, print does that for you if necessary.





          Note that using set does not guarantee the same order as the original file, for that you would want to use the itertools recipe unique_everseen:




          from itertools import filterfalse

          def unique_everseen(iterable, key=None):
          "List unique elements, preserving order. Remember all elements ever seen."
          # unique_everseen('AAAABBBCCDAABBB') --> A B C D
          # unique_everseen('ABBCcAD', str.lower) --> A B C D
          seen = set()
          seen_add = seen.add
          if key is None:
          for element in filterfalse(seen.__contains__, iterable):
          seen_add(element)
          yield element
          else:
          for element in iterable:
          k = key(element)
          if k not in seen:
          seen_add(k)
          yield element



          def dupfilter(file_initial, file_final):
          with open(file_initial) as in_file, open(file_final, "w") as out_file:
          out_file.writelines(unique_everseen(in_file))





          share|improve this answer



















          • 1




            Nice section on itertools instead of set!
            – Julien Rousé
            9 hours ago










          • Thanks a lot your help is really precious!
            – Thewizy
            7 hours ago






          • 1




            I didn't know that set could not give the same order when filtering the duplicate thanks!
            – Thewizy
            7 hours ago


















          up vote
          4
          down vote













          Thanks for sharing your code!



          I'll begin by pointing out you don't need global variable, and you should avoid them when possible. Very often it only does make your code less readable and more fragile.



          Here your function wordcount_initial could be rewritten as: (same idea for wordcount_final)



          def wordcount_initial(input_file):
          """Return the number of words in the file given in parameter

          :param input_file: A file name
          :type input_file: string
          :return: The number of words in the file `input_file`
          :rtype: int
          """
          num_words_initial = 0
          with open(input_file, 'r') as f:
          for line in f:
          words = line.split()
          num_words_initial += len(words)
          return num_words_initial


          There are a few changes here:




          • Removed num_words_initial as a global and return it's value at the end of the function. It's much more clean to return value that way when you can. It also help when you want to test your functions.

          • Gives input_file as a parameter of your function instead of relying on another global. It makes your function more reusable.

          • And I transformed your comment in docstring that can be used to generate documentation for your code. See ReStrusctured and sphinx for more information. Ideally every function should be documented (and every module too)


          Another remark, you name your function dupfilter, but every other name in your code has the format first_last which is a bit inconsistant. Also, don't try to gain a few letters when typing, write duplicatefilter or better (in my opinion) filter_duplicate.



          Naming is always a bit subjective, use your best judgement.



          And finally in your __main__ you could have put the logic for initialising the name of the file into another function but that's not very important.



          On a more positive note I like how you laid out your code, it is well spaced, most of the name are clear when you read them and you have comment which is often important.



          Nice job!






          share|improve this answer























          • Thank a lot for your help i will try to apply all thoses advice to my futur programs!
            – Thewizy
            7 hours ago











          Your Answer





          StackExchange.ifUsing("editor", function () {
          return StackExchange.using("mathjaxEditing", function () {
          StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix) {
          StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
          });
          });
          }, "mathjax-editing");

          StackExchange.ifUsing("editor", function () {
          StackExchange.using("externalEditor", function () {
          StackExchange.using("snippets", function () {
          StackExchange.snippets.init();
          });
          });
          }, "code-snippets");

          StackExchange.ready(function() {
          var channelOptions = {
          tags: "".split(" "),
          id: "196"
          };
          initTagRenderer("".split(" "), "".split(" "), channelOptions);

          StackExchange.using("externalEditor", function() {
          // Have to fire editor after snippets, if snippets enabled
          if (StackExchange.settings.snippets.snippetsEnabled) {
          StackExchange.using("snippets", function() {
          createEditor();
          });
          }
          else {
          createEditor();
          }
          });

          function createEditor() {
          StackExchange.prepareEditor({
          heartbeatType: 'answer',
          convertImagesToLinks: false,
          noModals: true,
          showLowRepImageUploadWarning: true,
          reputationToPostImages: null,
          bindNavPrevention: true,
          postfix: "",
          imageUploader: {
          brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
          contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
          allowUrls: true
          },
          onDemand: true,
          discardSelector: ".discard-answer"
          ,immediatelyShowMarkdownHelp:true
          });


          }
          });














           

          draft saved


          draft discarded


















          StackExchange.ready(
          function () {
          StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f208508%2ftext-duplicate-filter-in-python-3-7%23new-answer', 'question_page');
          }
          );

          Post as a guest















          Required, but never shown

























          2 Answers
          2






          active

          oldest

          votes








          2 Answers
          2






          active

          oldest

          votes









          active

          oldest

          votes






          active

          oldest

          votes








          up vote
          6
          down vote



          accepted










          You should not use global variables unless you really need to. Instead, pass relevant parameters as arguments to the functions and return the results. This makes them actually reusable. Currently you have two functions to count the initial and final number of words, when you could just have a word_count function:



          def wordcount(file_name):
          """count the number of words in the file"""
          with open(file_name) as f:
          return sum(len(line.split()) for line in f)

          def dupfilter(file_initial, file_final):
          with open(file_initial) as in_file, open(file_final, "w") as out_file:
          out_file.writelines(set(in_file.readlines()))

          if __name__ == '__main__':
          while True:
          file_initial = input("What is the name of the text?")
          file_final = input("How do you want to name the filtered file?")
          if file_initial and file_final and file_initial != file_final:
          break

          num_words_initial = wordcount(file_initial)
          dupfilter(file_initial, file_final)
          num_words_final = wordcount(file_final)

          print("Number of duplicates filtered:", num_words_initial - num_words_final)
          input("nPress <ENTER> to quit the program.")


          I also used sum with a generator expression to simplify the wordcount function, used with to ensure that files are properly closed.
          In addition, I replaced the while not ready loop with a while True loop and an explicit break. This is much more common in Python, especially for user input.



          Note that if file_initial and file_final != "" is only incidentally the same as if file_initial != "" and file_final != "" because non-empty strings are truthy. This is why it is also equivalent to if file_initial and file_final. But for example x and y == 3 is not the same as x == 3 and y == 3.



          You also don't need to call str on things to be printed, print does that for you if necessary.





          Note that using set does not guarantee the same order as the original file, for that you would want to use the itertools recipe unique_everseen:




          from itertools import filterfalse

          def unique_everseen(iterable, key=None):
          "List unique elements, preserving order. Remember all elements ever seen."
          # unique_everseen('AAAABBBCCDAABBB') --> A B C D
          # unique_everseen('ABBCcAD', str.lower) --> A B C D
          seen = set()
          seen_add = seen.add
          if key is None:
          for element in filterfalse(seen.__contains__, iterable):
          seen_add(element)
          yield element
          else:
          for element in iterable:
          k = key(element)
          if k not in seen:
          seen_add(k)
          yield element



          def dupfilter(file_initial, file_final):
          with open(file_initial) as in_file, open(file_final, "w") as out_file:
          out_file.writelines(unique_everseen(in_file))





          share|improve this answer



















          • 1




            Nice section on itertools instead of set!
            – Julien Rousé
            9 hours ago










          • Thanks a lot your help is really precious!
            – Thewizy
            7 hours ago






          • 1




            I didn't know that set could not give the same order when filtering the duplicate thanks!
            – Thewizy
            7 hours ago















          up vote
          6
          down vote



          accepted










          You should not use global variables unless you really need to. Instead, pass relevant parameters as arguments to the functions and return the results. This makes them actually reusable. Currently you have two functions to count the initial and final number of words, when you could just have a word_count function:



          def wordcount(file_name):
          """count the number of words in the file"""
          with open(file_name) as f:
          return sum(len(line.split()) for line in f)

          def dupfilter(file_initial, file_final):
          with open(file_initial) as in_file, open(file_final, "w") as out_file:
          out_file.writelines(set(in_file.readlines()))

          if __name__ == '__main__':
          while True:
          file_initial = input("What is the name of the text?")
          file_final = input("How do you want to name the filtered file?")
          if file_initial and file_final and file_initial != file_final:
          break

          num_words_initial = wordcount(file_initial)
          dupfilter(file_initial, file_final)
          num_words_final = wordcount(file_final)

          print("Number of duplicates filtered:", num_words_initial - num_words_final)
          input("nPress <ENTER> to quit the program.")


          I also used sum with a generator expression to simplify the wordcount function, used with to ensure that files are properly closed.
          In addition, I replaced the while not ready loop with a while True loop and an explicit break. This is much more common in Python, especially for user input.



          Note that if file_initial and file_final != "" is only incidentally the same as if file_initial != "" and file_final != "" because non-empty strings are truthy. This is why it is also equivalent to if file_initial and file_final. But for example x and y == 3 is not the same as x == 3 and y == 3.



          You also don't need to call str on things to be printed, print does that for you if necessary.





          Note that using set does not guarantee the same order as the original file, for that you would want to use the itertools recipe unique_everseen:




          from itertools import filterfalse

          def unique_everseen(iterable, key=None):
          "List unique elements, preserving order. Remember all elements ever seen."
          # unique_everseen('AAAABBBCCDAABBB') --> A B C D
          # unique_everseen('ABBCcAD', str.lower) --> A B C D
          seen = set()
          seen_add = seen.add
          if key is None:
          for element in filterfalse(seen.__contains__, iterable):
          seen_add(element)
          yield element
          else:
          for element in iterable:
          k = key(element)
          if k not in seen:
          seen_add(k)
          yield element



          def dupfilter(file_initial, file_final):
          with open(file_initial) as in_file, open(file_final, "w") as out_file:
          out_file.writelines(unique_everseen(in_file))





          share|improve this answer



















          • 1




            Nice section on itertools instead of set!
            – Julien Rousé
            9 hours ago










          • Thanks a lot your help is really precious!
            – Thewizy
            7 hours ago






          • 1




            I didn't know that set could not give the same order when filtering the duplicate thanks!
            – Thewizy
            7 hours ago













          up vote
          6
          down vote



          accepted







          up vote
          6
          down vote



          accepted






          You should not use global variables unless you really need to. Instead, pass relevant parameters as arguments to the functions and return the results. This makes them actually reusable. Currently you have two functions to count the initial and final number of words, when you could just have a word_count function:



          def wordcount(file_name):
          """count the number of words in the file"""
          with open(file_name) as f:
          return sum(len(line.split()) for line in f)

          def dupfilter(file_initial, file_final):
          with open(file_initial) as in_file, open(file_final, "w") as out_file:
          out_file.writelines(set(in_file.readlines()))

          if __name__ == '__main__':
          while True:
          file_initial = input("What is the name of the text?")
          file_final = input("How do you want to name the filtered file?")
          if file_initial and file_final and file_initial != file_final:
          break

          num_words_initial = wordcount(file_initial)
          dupfilter(file_initial, file_final)
          num_words_final = wordcount(file_final)

          print("Number of duplicates filtered:", num_words_initial - num_words_final)
          input("nPress <ENTER> to quit the program.")


          I also used sum with a generator expression to simplify the wordcount function, used with to ensure that files are properly closed.
          In addition, I replaced the while not ready loop with a while True loop and an explicit break. This is much more common in Python, especially for user input.



          Note that if file_initial and file_final != "" is only incidentally the same as if file_initial != "" and file_final != "" because non-empty strings are truthy. This is why it is also equivalent to if file_initial and file_final. But for example x and y == 3 is not the same as x == 3 and y == 3.



          You also don't need to call str on things to be printed, print does that for you if necessary.





          Note that using set does not guarantee the same order as the original file, for that you would want to use the itertools recipe unique_everseen:




          from itertools import filterfalse

          def unique_everseen(iterable, key=None):
          "List unique elements, preserving order. Remember all elements ever seen."
          # unique_everseen('AAAABBBCCDAABBB') --> A B C D
          # unique_everseen('ABBCcAD', str.lower) --> A B C D
          seen = set()
          seen_add = seen.add
          if key is None:
          for element in filterfalse(seen.__contains__, iterable):
          seen_add(element)
          yield element
          else:
          for element in iterable:
          k = key(element)
          if k not in seen:
          seen_add(k)
          yield element



          def dupfilter(file_initial, file_final):
          with open(file_initial) as in_file, open(file_final, "w") as out_file:
          out_file.writelines(unique_everseen(in_file))





          share|improve this answer














          You should not use global variables unless you really need to. Instead, pass relevant parameters as arguments to the functions and return the results. This makes them actually reusable. Currently you have two functions to count the initial and final number of words, when you could just have a word_count function:



          def wordcount(file_name):
          """count the number of words in the file"""
          with open(file_name) as f:
          return sum(len(line.split()) for line in f)

          def dupfilter(file_initial, file_final):
          with open(file_initial) as in_file, open(file_final, "w") as out_file:
          out_file.writelines(set(in_file.readlines()))

          if __name__ == '__main__':
          while True:
          file_initial = input("What is the name of the text?")
          file_final = input("How do you want to name the filtered file?")
          if file_initial and file_final and file_initial != file_final:
          break

          num_words_initial = wordcount(file_initial)
          dupfilter(file_initial, file_final)
          num_words_final = wordcount(file_final)

          print("Number of duplicates filtered:", num_words_initial - num_words_final)
          input("nPress <ENTER> to quit the program.")


          I also used sum with a generator expression to simplify the wordcount function, used with to ensure that files are properly closed.
          In addition, I replaced the while not ready loop with a while True loop and an explicit break. This is much more common in Python, especially for user input.



          Note that if file_initial and file_final != "" is only incidentally the same as if file_initial != "" and file_final != "" because non-empty strings are truthy. This is why it is also equivalent to if file_initial and file_final. But for example x and y == 3 is not the same as x == 3 and y == 3.



          You also don't need to call str on things to be printed, print does that for you if necessary.





          Note that using set does not guarantee the same order as the original file, for that you would want to use the itertools recipe unique_everseen:




          from itertools import filterfalse

          def unique_everseen(iterable, key=None):
          "List unique elements, preserving order. Remember all elements ever seen."
          # unique_everseen('AAAABBBCCDAABBB') --> A B C D
          # unique_everseen('ABBCcAD', str.lower) --> A B C D
          seen = set()
          seen_add = seen.add
          if key is None:
          for element in filterfalse(seen.__contains__, iterable):
          seen_add(element)
          yield element
          else:
          for element in iterable:
          k = key(element)
          if k not in seen:
          seen_add(k)
          yield element



          def dupfilter(file_initial, file_final):
          with open(file_initial) as in_file, open(file_final, "w") as out_file:
          out_file.writelines(unique_everseen(in_file))






          share|improve this answer














          share|improve this answer



          share|improve this answer








          edited 9 hours ago

























          answered 9 hours ago









          Graipher

          22.4k53284




          22.4k53284








          • 1




            Nice section on itertools instead of set!
            – Julien Rousé
            9 hours ago










          • Thanks a lot your help is really precious!
            – Thewizy
            7 hours ago






          • 1




            I didn't know that set could not give the same order when filtering the duplicate thanks!
            – Thewizy
            7 hours ago














          • 1




            Nice section on itertools instead of set!
            – Julien Rousé
            9 hours ago










          • Thanks a lot your help is really precious!
            – Thewizy
            7 hours ago






          • 1




            I didn't know that set could not give the same order when filtering the duplicate thanks!
            – Thewizy
            7 hours ago








          1




          1




          Nice section on itertools instead of set!
          – Julien Rousé
          9 hours ago




          Nice section on itertools instead of set!
          – Julien Rousé
          9 hours ago












          Thanks a lot your help is really precious!
          – Thewizy
          7 hours ago




          Thanks a lot your help is really precious!
          – Thewizy
          7 hours ago




          1




          1




          I didn't know that set could not give the same order when filtering the duplicate thanks!
          – Thewizy
          7 hours ago




          I didn't know that set could not give the same order when filtering the duplicate thanks!
          – Thewizy
          7 hours ago












          up vote
          4
          down vote













          Thanks for sharing your code!



          I'll begin by pointing out you don't need global variable, and you should avoid them when possible. Very often it only does make your code less readable and more fragile.



          Here your function wordcount_initial could be rewritten as: (same idea for wordcount_final)



          def wordcount_initial(input_file):
          """Return the number of words in the file given in parameter

          :param input_file: A file name
          :type input_file: string
          :return: The number of words in the file `input_file`
          :rtype: int
          """
          num_words_initial = 0
          with open(input_file, 'r') as f:
          for line in f:
          words = line.split()
          num_words_initial += len(words)
          return num_words_initial


          There are a few changes here:




          • Removed num_words_initial as a global and return it's value at the end of the function. It's much more clean to return value that way when you can. It also help when you want to test your functions.

          • Gives input_file as a parameter of your function instead of relying on another global. It makes your function more reusable.

          • And I transformed your comment in docstring that can be used to generate documentation for your code. See ReStrusctured and sphinx for more information. Ideally every function should be documented (and every module too)


          Another remark, you name your function dupfilter, but every other name in your code has the format first_last which is a bit inconsistant. Also, don't try to gain a few letters when typing, write duplicatefilter or better (in my opinion) filter_duplicate.



          Naming is always a bit subjective, use your best judgement.



          And finally in your __main__ you could have put the logic for initialising the name of the file into another function but that's not very important.



          On a more positive note I like how you laid out your code, it is well spaced, most of the name are clear when you read them and you have comment which is often important.



          Nice job!






          share|improve this answer























          • Thank a lot for your help i will try to apply all thoses advice to my futur programs!
            – Thewizy
            7 hours ago















          up vote
          4
          down vote













          Thanks for sharing your code!



          I'll begin by pointing out you don't need global variable, and you should avoid them when possible. Very often it only does make your code less readable and more fragile.



          Here your function wordcount_initial could be rewritten as: (same idea for wordcount_final)



          def wordcount_initial(input_file):
          """Return the number of words in the file given in parameter

          :param input_file: A file name
          :type input_file: string
          :return: The number of words in the file `input_file`
          :rtype: int
          """
          num_words_initial = 0
          with open(input_file, 'r') as f:
          for line in f:
          words = line.split()
          num_words_initial += len(words)
          return num_words_initial


          There are a few changes here:




          • Removed num_words_initial as a global and return it's value at the end of the function. It's much more clean to return value that way when you can. It also help when you want to test your functions.

          • Gives input_file as a parameter of your function instead of relying on another global. It makes your function more reusable.

          • And I transformed your comment in docstring that can be used to generate documentation for your code. See ReStrusctured and sphinx for more information. Ideally every function should be documented (and every module too)


          Another remark, you name your function dupfilter, but every other name in your code has the format first_last which is a bit inconsistant. Also, don't try to gain a few letters when typing, write duplicatefilter or better (in my opinion) filter_duplicate.



          Naming is always a bit subjective, use your best judgement.



          And finally in your __main__ you could have put the logic for initialising the name of the file into another function but that's not very important.



          On a more positive note I like how you laid out your code, it is well spaced, most of the name are clear when you read them and you have comment which is often important.



          Nice job!






          share|improve this answer























          • Thank a lot for your help i will try to apply all thoses advice to my futur programs!
            – Thewizy
            7 hours ago













          up vote
          4
          down vote










          up vote
          4
          down vote









          Thanks for sharing your code!



          I'll begin by pointing out you don't need global variable, and you should avoid them when possible. Very often it only does make your code less readable and more fragile.



          Here your function wordcount_initial could be rewritten as: (same idea for wordcount_final)



          def wordcount_initial(input_file):
          """Return the number of words in the file given in parameter

          :param input_file: A file name
          :type input_file: string
          :return: The number of words in the file `input_file`
          :rtype: int
          """
          num_words_initial = 0
          with open(input_file, 'r') as f:
          for line in f:
          words = line.split()
          num_words_initial += len(words)
          return num_words_initial


          There are a few changes here:




          • Removed num_words_initial as a global and return it's value at the end of the function. It's much more clean to return value that way when you can. It also help when you want to test your functions.

          • Gives input_file as a parameter of your function instead of relying on another global. It makes your function more reusable.

          • And I transformed your comment in docstring that can be used to generate documentation for your code. See ReStrusctured and sphinx for more information. Ideally every function should be documented (and every module too)


          Another remark, you name your function dupfilter, but every other name in your code has the format first_last which is a bit inconsistant. Also, don't try to gain a few letters when typing, write duplicatefilter or better (in my opinion) filter_duplicate.



          Naming is always a bit subjective, use your best judgement.



          And finally in your __main__ you could have put the logic for initialising the name of the file into another function but that's not very important.



          On a more positive note I like how you laid out your code, it is well spaced, most of the name are clear when you read them and you have comment which is often important.



          Nice job!






          share|improve this answer














          Thanks for sharing your code!



          I'll begin by pointing out you don't need global variable, and you should avoid them when possible. Very often it only does make your code less readable and more fragile.



          Here your function wordcount_initial could be rewritten as: (same idea for wordcount_final)



          def wordcount_initial(input_file):
          """Return the number of words in the file given in parameter

          :param input_file: A file name
          :type input_file: string
          :return: The number of words in the file `input_file`
          :rtype: int
          """
          num_words_initial = 0
          with open(input_file, 'r') as f:
          for line in f:
          words = line.split()
          num_words_initial += len(words)
          return num_words_initial


          There are a few changes here:




          • Removed num_words_initial as a global and return it's value at the end of the function. It's much more clean to return value that way when you can. It also help when you want to test your functions.

          • Gives input_file as a parameter of your function instead of relying on another global. It makes your function more reusable.

          • And I transformed your comment in docstring that can be used to generate documentation for your code. See ReStrusctured and sphinx for more information. Ideally every function should be documented (and every module too)


          Another remark, you name your function dupfilter, but every other name in your code has the format first_last which is a bit inconsistant. Also, don't try to gain a few letters when typing, write duplicatefilter or better (in my opinion) filter_duplicate.



          Naming is always a bit subjective, use your best judgement.



          And finally in your __main__ you could have put the logic for initialising the name of the file into another function but that's not very important.



          On a more positive note I like how you laid out your code, it is well spaced, most of the name are clear when you read them and you have comment which is often important.



          Nice job!







          share|improve this answer














          share|improve this answer



          share|improve this answer








          edited 8 hours ago

























          answered 9 hours ago









          Julien Rousé

          544416




          544416












          • Thank a lot for your help i will try to apply all thoses advice to my futur programs!
            – Thewizy
            7 hours ago


















          • Thank a lot for your help i will try to apply all thoses advice to my futur programs!
            – Thewizy
            7 hours ago
















          Thank a lot for your help i will try to apply all thoses advice to my futur programs!
          – Thewizy
          7 hours ago




          Thank a lot for your help i will try to apply all thoses advice to my futur programs!
          – Thewizy
          7 hours ago


















           

          draft saved


          draft discarded



















































           


          draft saved


          draft discarded














          StackExchange.ready(
          function () {
          StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f208508%2ftext-duplicate-filter-in-python-3-7%23new-answer', 'question_page');
          }
          );

          Post as a guest















          Required, but never shown





















































          Required, but never shown














          Required, but never shown












          Required, but never shown







          Required, but never shown

































          Required, but never shown














          Required, but never shown












          Required, but never shown







          Required, but never shown







          Popular posts from this blog

          Accessing regular linux commands in Huawei's Dopra Linux

          Can't connect RFCOMM socket: Host is down

          Kernel panic - not syncing: Fatal Exception in Interrupt