[PATCH 4/6] tests/acceptance: Introduce tesseract_ocr() helper

Philippe Mathieu-Daudé posted 6 patches 5 years, 3 months ago
[PATCH 4/6] tests/acceptance: Introduce tesseract_ocr() helper
Posted by Philippe Mathieu-Daudé 5 years, 3 months ago
We are going to reuse the tesseract OCR code.
Create a new tesseract_ocr() helper and use it.

Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
---
 tests/acceptance/machine_m68k_nextcube.py | 21 +++++----------------
 tests/acceptance/tesseract_utils.py       | 18 ++++++++++++++++++
 2 files changed, 23 insertions(+), 16 deletions(-)

diff --git a/tests/acceptance/machine_m68k_nextcube.py b/tests/acceptance/machine_m68k_nextcube.py
index 3c7400c43e4..09e2745cc52 100644
--- a/tests/acceptance/machine_m68k_nextcube.py
+++ b/tests/acceptance/machine_m68k_nextcube.py
@@ -7,13 +7,11 @@
 
 import os
 import time
-import logging
 
 from avocado_qemu import Test
 from avocado import skipUnless
-from avocado.utils import process
 
-from tesseract_utils import tesseract_available
+from tesseract_utils import tesseract_available, tesseract_ocr
 
 PIL_AVAILABLE = True
 try:
@@ -61,12 +59,8 @@ def test_bootrom_framebuffer_size(self):
     def test_bootrom_framebuffer_ocr_with_tesseract_v3(self):
         screenshot_path = os.path.join(self.workdir, "dump.ppm")
         self.check_bootrom_framebuffer(screenshot_path)
-
-        console_logger = logging.getLogger('console')
-        text = process.run("tesseract %s stdout" % screenshot_path).stdout_text
-        for line in text.split('\n'):
-            if len(line):
-                console_logger.debug(line)
+        lines = tesseract_ocr(screenshot_path, tesseract_version=3)
+        text = '\n'.join(lines)
         self.assertIn('Backplane', text)
         self.assertIn('Ethernet address', text)
 
@@ -77,13 +71,8 @@ def test_bootrom_framebuffer_ocr_with_tesseract_v3(self):
     def test_bootrom_framebuffer_ocr_with_tesseract_v4(self):
         screenshot_path = os.path.join(self.workdir, "dump.ppm")
         self.check_bootrom_framebuffer(screenshot_path)
-
-        console_logger = logging.getLogger('console')
-        proc = process.run("tesseract --oem 1 %s stdout" % screenshot_path)
-        text = proc.stdout_text
-        for line in text.split('\n'):
-            if len(line):
-                console_logger.debug(line)
+        lines = tesseract_ocr(screenshot_path, tesseract_version=4)
+        text = '\n'.join(lines)
         self.assertIn('Testing the FPU, SCC', text)
         self.assertIn('System test failed. Error code', text)
         self.assertIn('Boot command', text)
diff --git a/tests/acceptance/tesseract_utils.py b/tests/acceptance/tesseract_utils.py
index acd6e8c2faa..72cd9ab7989 100644
--- a/tests/acceptance/tesseract_utils.py
+++ b/tests/acceptance/tesseract_utils.py
@@ -6,7 +6,9 @@
 # later. See the COPYING file in the top-level directory.
 
 import re
+import logging
 
+from avocado.utils import process
 from avocado.utils.path import find_command, CmdNotFoundError
 
 def tesseract_available(expected_version):
@@ -26,3 +28,19 @@ def tesseract_available(expected_version):
         return False
     # now this is guaranteed to be a digit
     return int(match.groups()[0]) == expected_version
+
+
+def tesseract_ocr(image_path, tesseract_args='', tesseract_version=3):
+    console_logger = logging.getLogger('tesseract')
+    console_logger.debug(image_path)
+    if tesseract_version == 4:
+        tesseract_args += ' --oem 1'
+    proc = process.run("tesseract {} {} stdout".format(tesseract_args,
+                                                       image_path))
+    lines = []
+    for line in proc.stdout_text.split('\n'):
+        sline = line.strip()
+        if len(sline):
+            console_logger.debug(sline)
+            lines += [sline]
+    return lines
-- 
2.26.2

Re: [PATCH 4/6] tests/acceptance: Introduce tesseract_ocr() helper
Posted by Thomas Huth 5 years, 3 months ago
On 21/10/2020 12.50, Philippe Mathieu-Daudé wrote:
> We are going to reuse the tesseract OCR code.
> Create a new tesseract_ocr() helper and use it.
> 
> Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
> ---
>  tests/acceptance/machine_m68k_nextcube.py | 21 +++++----------------
>  tests/acceptance/tesseract_utils.py       | 18 ++++++++++++++++++
>  2 files changed, 23 insertions(+), 16 deletions(-)
> 
> diff --git a/tests/acceptance/machine_m68k_nextcube.py b/tests/acceptance/machine_m68k_nextcube.py
> index 3c7400c43e4..09e2745cc52 100644
> --- a/tests/acceptance/machine_m68k_nextcube.py
> +++ b/tests/acceptance/machine_m68k_nextcube.py
> @@ -7,13 +7,11 @@
>  
>  import os
>  import time
> -import logging
>  
>  from avocado_qemu import Test
>  from avocado import skipUnless
> -from avocado.utils import process
>  
> -from tesseract_utils import tesseract_available
> +from tesseract_utils import tesseract_available, tesseract_ocr
>  
>  PIL_AVAILABLE = True
>  try:
> @@ -61,12 +59,8 @@ def test_bootrom_framebuffer_size(self):
>      def test_bootrom_framebuffer_ocr_with_tesseract_v3(self):
>          screenshot_path = os.path.join(self.workdir, "dump.ppm")
>          self.check_bootrom_framebuffer(screenshot_path)
> -
> -        console_logger = logging.getLogger('console')
> -        text = process.run("tesseract %s stdout" % screenshot_path).stdout_text
> -        for line in text.split('\n'):
> -            if len(line):
> -                console_logger.debug(line)
> +        lines = tesseract_ocr(screenshot_path, tesseract_version=3)
> +        text = '\n'.join(lines)
>          self.assertIn('Backplane', text)
>          self.assertIn('Ethernet address', text)
>  
> @@ -77,13 +71,8 @@ def test_bootrom_framebuffer_ocr_with_tesseract_v3(self):
>      def test_bootrom_framebuffer_ocr_with_tesseract_v4(self):
>          screenshot_path = os.path.join(self.workdir, "dump.ppm")
>          self.check_bootrom_framebuffer(screenshot_path)
> -
> -        console_logger = logging.getLogger('console')
> -        proc = process.run("tesseract --oem 1 %s stdout" % screenshot_path)
> -        text = proc.stdout_text
> -        for line in text.split('\n'):
> -            if len(line):
> -                console_logger.debug(line)
> +        lines = tesseract_ocr(screenshot_path, tesseract_version=4)
> +        text = '\n'.join(lines)
>          self.assertIn('Testing the FPU, SCC', text)
>          self.assertIn('System test failed. Error code', text)
>          self.assertIn('Boot command', text)
> diff --git a/tests/acceptance/tesseract_utils.py b/tests/acceptance/tesseract_utils.py
> index acd6e8c2faa..72cd9ab7989 100644
> --- a/tests/acceptance/tesseract_utils.py
> +++ b/tests/acceptance/tesseract_utils.py
> @@ -6,7 +6,9 @@
>  # later. See the COPYING file in the top-level directory.
>  
>  import re
> +import logging
>  
> +from avocado.utils import process
>  from avocado.utils.path import find_command, CmdNotFoundError
>  
>  def tesseract_available(expected_version):
> @@ -26,3 +28,19 @@ def tesseract_available(expected_version):
>          return False
>      # now this is guaranteed to be a digit
>      return int(match.groups()[0]) == expected_version
> +
> +
> +def tesseract_ocr(image_path, tesseract_args='', tesseract_version=3):
> +    console_logger = logging.getLogger('tesseract')
> +    console_logger.debug(image_path)
> +    if tesseract_version == 4:
> +        tesseract_args += ' --oem 1'
> +    proc = process.run("tesseract {} {} stdout".format(tesseract_args,
> +                                                       image_path))
> +    lines = []
> +    for line in proc.stdout_text.split('\n'):
> +        sline = line.strip()
> +        if len(sline):
> +            console_logger.debug(sline)
> +            lines += [sline]
> +    return lines

Would it make sense to completely hide the tesseract version handling in
this new tesseract_utils.py file now, so that the tests themselves do not
have to worry about this anymore (i.e. would it be possible to merge
test_bootrom_framebuffer_ocr_with_tesseract_v3 and
test_bootrom_framebuffer_ocr_with_tesseract_v4 into one single test that way?)

 Thomas


Re: [PATCH 4/6] tests/acceptance: Introduce tesseract_ocr() helper
Posted by Thomas Huth 5 years, 3 months ago
On 24/10/2020 08.35, Thomas Huth wrote:
> On 21/10/2020 12.50, Philippe Mathieu-Daudé wrote:
>> We are going to reuse the tesseract OCR code.
>> Create a new tesseract_ocr() helper and use it.
>>
>> Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
>> ---
>>  tests/acceptance/machine_m68k_nextcube.py | 21 +++++----------------
>>  tests/acceptance/tesseract_utils.py       | 18 ++++++++++++++++++
>>  2 files changed, 23 insertions(+), 16 deletions(-)
>>
>> diff --git a/tests/acceptance/machine_m68k_nextcube.py b/tests/acceptance/machine_m68k_nextcube.py
>> index 3c7400c43e4..09e2745cc52 100644
>> --- a/tests/acceptance/machine_m68k_nextcube.py
>> +++ b/tests/acceptance/machine_m68k_nextcube.py
>> @@ -7,13 +7,11 @@
>>  
>>  import os
>>  import time
>> -import logging
>>  
>>  from avocado_qemu import Test
>>  from avocado import skipUnless
>> -from avocado.utils import process
>>  
>> -from tesseract_utils import tesseract_available
>> +from tesseract_utils import tesseract_available, tesseract_ocr
>>  
>>  PIL_AVAILABLE = True
>>  try:
>> @@ -61,12 +59,8 @@ def test_bootrom_framebuffer_size(self):
>>      def test_bootrom_framebuffer_ocr_with_tesseract_v3(self):
>>          screenshot_path = os.path.join(self.workdir, "dump.ppm")
>>          self.check_bootrom_framebuffer(screenshot_path)
>> -
>> -        console_logger = logging.getLogger('console')
>> -        text = process.run("tesseract %s stdout" % screenshot_path).stdout_text
>> -        for line in text.split('\n'):
>> -            if len(line):
>> -                console_logger.debug(line)
>> +        lines = tesseract_ocr(screenshot_path, tesseract_version=3)
>> +        text = '\n'.join(lines)
>>          self.assertIn('Backplane', text)
>>          self.assertIn('Ethernet address', text)
>>  
>> @@ -77,13 +71,8 @@ def test_bootrom_framebuffer_ocr_with_tesseract_v3(self):
>>      def test_bootrom_framebuffer_ocr_with_tesseract_v4(self):
>>          screenshot_path = os.path.join(self.workdir, "dump.ppm")
>>          self.check_bootrom_framebuffer(screenshot_path)
>> -
>> -        console_logger = logging.getLogger('console')
>> -        proc = process.run("tesseract --oem 1 %s stdout" % screenshot_path)
>> -        text = proc.stdout_text
>> -        for line in text.split('\n'):
>> -            if len(line):
>> -                console_logger.debug(line)
>> +        lines = tesseract_ocr(screenshot_path, tesseract_version=4)
>> +        text = '\n'.join(lines)
>>          self.assertIn('Testing the FPU, SCC', text)
>>          self.assertIn('System test failed. Error code', text)
>>          self.assertIn('Boot command', text)
>> diff --git a/tests/acceptance/tesseract_utils.py b/tests/acceptance/tesseract_utils.py
>> index acd6e8c2faa..72cd9ab7989 100644
>> --- a/tests/acceptance/tesseract_utils.py
>> +++ b/tests/acceptance/tesseract_utils.py
>> @@ -6,7 +6,9 @@
>>  # later. See the COPYING file in the top-level directory.
>>  
>>  import re
>> +import logging
>>  
>> +from avocado.utils import process
>>  from avocado.utils.path import find_command, CmdNotFoundError
>>  
>>  def tesseract_available(expected_version):
>> @@ -26,3 +28,19 @@ def tesseract_available(expected_version):
>>          return False
>>      # now this is guaranteed to be a digit
>>      return int(match.groups()[0]) == expected_version
>> +
>> +
>> +def tesseract_ocr(image_path, tesseract_args='', tesseract_version=3):
>> +    console_logger = logging.getLogger('tesseract')
>> +    console_logger.debug(image_path)
>> +    if tesseract_version == 4:
>> +        tesseract_args += ' --oem 1'
>> +    proc = process.run("tesseract {} {} stdout".format(tesseract_args,
>> +                                                       image_path))
>> +    lines = []
>> +    for line in proc.stdout_text.split('\n'):
>> +        sline = line.strip()
>> +        if len(sline):
>> +            console_logger.debug(sline)
>> +            lines += [sline]
>> +    return lines
> 
> Would it make sense to completely hide the tesseract version handling in
> this new tesseract_utils.py file now, so that the tests themselves do not
> have to worry about this anymore (i.e. would it be possible to merge
> test_bootrom_framebuffer_ocr_with_tesseract_v3 and
> test_bootrom_framebuffer_ocr_with_tesseract_v4 into one single test that way?)

If I've got that right, there is also now a proper release 4 of Tesseract,
so maybe we can simply scratch the testing with version 3 now?

 Thomas


Re: [PATCH 4/6] tests/acceptance: Introduce tesseract_ocr() helper
Posted by Philippe Mathieu-Daudé 5 years, 3 months ago
On 10/24/20 8:40 AM, Thomas Huth wrote:
> On 24/10/2020 08.35, Thomas Huth wrote:
>> On 21/10/2020 12.50, Philippe Mathieu-Daudé wrote:
>>> We are going to reuse the tesseract OCR code.
>>> Create a new tesseract_ocr() helper and use it.
>>>
>>> Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
>>> ---
>>>   tests/acceptance/machine_m68k_nextcube.py | 21 +++++----------------
>>>   tests/acceptance/tesseract_utils.py       | 18 ++++++++++++++++++
>>>   2 files changed, 23 insertions(+), 16 deletions(-)
>>>
>>> diff --git a/tests/acceptance/machine_m68k_nextcube.py b/tests/acceptance/machine_m68k_nextcube.py
>>> index 3c7400c43e4..09e2745cc52 100644
>>> --- a/tests/acceptance/machine_m68k_nextcube.py
>>> +++ b/tests/acceptance/machine_m68k_nextcube.py
>>> @@ -7,13 +7,11 @@
>>>   
>>>   import os
>>>   import time
>>> -import logging
>>>   
>>>   from avocado_qemu import Test
>>>   from avocado import skipUnless
>>> -from avocado.utils import process
>>>   
>>> -from tesseract_utils import tesseract_available
>>> +from tesseract_utils import tesseract_available, tesseract_ocr
>>>   
>>>   PIL_AVAILABLE = True
>>>   try:
>>> @@ -61,12 +59,8 @@ def test_bootrom_framebuffer_size(self):
>>>       def test_bootrom_framebuffer_ocr_with_tesseract_v3(self):
>>>           screenshot_path = os.path.join(self.workdir, "dump.ppm")
>>>           self.check_bootrom_framebuffer(screenshot_path)
>>> -
>>> -        console_logger = logging.getLogger('console')
>>> -        text = process.run("tesseract %s stdout" % screenshot_path).stdout_text
>>> -        for line in text.split('\n'):
>>> -            if len(line):
>>> -                console_logger.debug(line)
>>> +        lines = tesseract_ocr(screenshot_path, tesseract_version=3)
>>> +        text = '\n'.join(lines)
>>>           self.assertIn('Backplane', text)
>>>           self.assertIn('Ethernet address', text)
>>>   
>>> @@ -77,13 +71,8 @@ def test_bootrom_framebuffer_ocr_with_tesseract_v3(self):
>>>       def test_bootrom_framebuffer_ocr_with_tesseract_v4(self):
>>>           screenshot_path = os.path.join(self.workdir, "dump.ppm")
>>>           self.check_bootrom_framebuffer(screenshot_path)
>>> -
>>> -        console_logger = logging.getLogger('console')
>>> -        proc = process.run("tesseract --oem 1 %s stdout" % screenshot_path)
>>> -        text = proc.stdout_text
>>> -        for line in text.split('\n'):
>>> -            if len(line):
>>> -                console_logger.debug(line)
>>> +        lines = tesseract_ocr(screenshot_path, tesseract_version=4)
>>> +        text = '\n'.join(lines)
>>>           self.assertIn('Testing the FPU, SCC', text)
>>>           self.assertIn('System test failed. Error code', text)
>>>           self.assertIn('Boot command', text)
>>> diff --git a/tests/acceptance/tesseract_utils.py b/tests/acceptance/tesseract_utils.py
>>> index acd6e8c2faa..72cd9ab7989 100644
>>> --- a/tests/acceptance/tesseract_utils.py
>>> +++ b/tests/acceptance/tesseract_utils.py
>>> @@ -6,7 +6,9 @@
>>>   # later. See the COPYING file in the top-level directory.
>>>   
>>>   import re
>>> +import logging
>>>   
>>> +from avocado.utils import process
>>>   from avocado.utils.path import find_command, CmdNotFoundError
>>>   
>>>   def tesseract_available(expected_version):
>>> @@ -26,3 +28,19 @@ def tesseract_available(expected_version):
>>>           return False
>>>       # now this is guaranteed to be a digit
>>>       return int(match.groups()[0]) == expected_version
>>> +
>>> +
>>> +def tesseract_ocr(image_path, tesseract_args='', tesseract_version=3):
>>> +    console_logger = logging.getLogger('tesseract')
>>> +    console_logger.debug(image_path)
>>> +    if tesseract_version == 4:
>>> +        tesseract_args += ' --oem 1'
>>> +    proc = process.run("tesseract {} {} stdout".format(tesseract_args,
>>> +                                                       image_path))
>>> +    lines = []
>>> +    for line in proc.stdout_text.split('\n'):
>>> +        sline = line.strip()
>>> +        if len(sline):
>>> +            console_logger.debug(sline)
>>> +            lines += [sline]
>>> +    return lines
>>
>> Would it make sense to completely hide the tesseract version handling in
>> this new tesseract_utils.py file now, so that the tests themselves do not
>> have to worry about this anymore

Yes, good idea.

> (i.e. would it be possible to merge
>> test_bootrom_framebuffer_ocr_with_tesseract_v3 and
>> test_bootrom_framebuffer_ocr_with_tesseract_v4 into one single test that way?)
> 
> If I've got that right, there is also now a proper release 4 of Tesseract,
> so maybe we can simply scratch the testing with version 3 now?

Good to know, I'll have a look. Thanks!

> 
>   Thomas
> 

Re: [PATCH 4/6] tests/acceptance: Introduce tesseract_ocr() helper
Posted by Philippe Mathieu-Daudé 5 years, 3 months ago
On 10/24/20 9:37 AM, Philippe Mathieu-Daudé wrote:
> On 10/24/20 8:40 AM, Thomas Huth wrote:
>> On 24/10/2020 08.35, Thomas Huth wrote:
>>> On 21/10/2020 12.50, Philippe Mathieu-Daudé wrote:
>>>> We are going to reuse the tesseract OCR code.
>>>> Create a new tesseract_ocr() helper and use it.
>>>>
>>>> Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
>>>> ---
>>>>   tests/acceptance/machine_m68k_nextcube.py | 21 +++++----------------
>>>>   tests/acceptance/tesseract_utils.py       | 18 ++++++++++++++++++
>>>>   2 files changed, 23 insertions(+), 16 deletions(-)
...

>>>
>>> Would it make sense to completely hide the tesseract version handling in
>>> this new tesseract_utils.py file now, so that the tests themselves do 
>>> not
>>> have to worry about this anymore

The problem is that the recognized strings differ between versions;
see tests/acceptance/machine_m68k_nextcube.py:

         lines = tesseract_ocr(screenshot_path, tesseract_version=3)
         text = '\n'.join(lines)
         self.assertIn('Backplane', text)
         self.assertIn('Ethernet address', text)

and:

         lines = tesseract_ocr(screenshot_path, tesseract_version=4)
         text = '\n'.join(lines)
         self.assertIn('Testing the FPU, SCC', text)
         self.assertIn('System test failed. Error code', text)
         self.assertIn('Boot command', text)
         self.assertIn('Next>', text)

> 
> Yes, good idea.
> 
>> (i.e. would it be possible to merge
>>> test_bootrom_framebuffer_ocr_with_tesseract_v3 and
>>> test_bootrom_framebuffer_ocr_with_tesseract_v4 into one single test 
>>> that way?)
>>
>> If I've got that right, there is also now a proper release 4 of 
>> Tesseract,
>> so maybe we can simply scratch the testing with version 3 now?
> 
> Good to know, I'll have a look. Thanks!
> 
>>
>>   Thomas
>>
> 


Re: [PATCH 4/6] tests/acceptance: Introduce tesseract_ocr() helper
Posted by Thomas Huth 5 years, 3 months ago
On 24/10/2020 19.40, Philippe Mathieu-Daudé wrote:
> On 10/24/20 9:37 AM, Philippe Mathieu-Daudé wrote:
>> On 10/24/20 8:40 AM, Thomas Huth wrote:
>>> On 24/10/2020 08.35, Thomas Huth wrote:
>>>> On 21/10/2020 12.50, Philippe Mathieu-Daudé wrote:
>>>>> We are going to reuse the tesseract OCR code.
>>>>> Create a new tesseract_ocr() helper and use it.
>>>>>
>>>>> Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
>>>>> ---
>>>>>   tests/acceptance/machine_m68k_nextcube.py | 21 +++++----------------
>>>>>   tests/acceptance/tesseract_utils.py       | 18 ++++++++++++++++++
>>>>>   2 files changed, 23 insertions(+), 16 deletions(-)
> ...
> 
>>>>
>>>> Would it make sense to completely hide the tesseract version handling in
>>>> this new tesseract_utils.py file now, so that the tests themselves do not
>>>> have to worry about this anymore
> 
> The problem is that the recognized strings differ between versions;
> see tests/acceptance/machine_m68k_nextcube.py:
> 
>         lines = tesseract_ocr(screenshot_path, tesseract_version=3)
>         text = '\n'.join(lines)
>         self.assertIn('Backplane', text)
>         self.assertIn('Ethernet address', text)
> 
> and:
> 
>         lines = tesseract_ocr(screenshot_path, tesseract_version=4)
>         text = '\n'.join(lines)
>         self.assertIn('Testing the FPU, SCC', text)
>         self.assertIn('System test failed. Error code', text)
>         self.assertIn('Boot command', text)
>         self.assertIn('Next>', text)

Ah, right, I forgot about that ... well, one more reason to completely
switch to tesseract v4 now ;-)

 Thomas