Parse interlinear to csv: Difference between revisions

From 2nd Book
Jump to navigation | Jump to search
m
Text replacement - " " to ":"
No edit summary
m (Text replacement - " " to ":")
Tags: Mobile edit Mobile web edit
Line 13: Line 13:
# Function to parse the CSV and transform data
# Function to parse the CSV and transform data
def transform_bible_data(input_file, output_file):
def transform_bible_data(input_file, output_file):
    with open(input_file, mode='r', encoding='utf-8') as infile:
:with open(input_file, mode='r', encoding='utf-8') as infile:
        reader = csv.DictReader(infile)
::reader = csv.DictReader(infile)
       
::
        # Prepare the fieldnames for the output CSV
::# Prepare the fieldnames for the output CSV
        fieldnames = ['id', 'book', 'chapter', 'verse', 'word', 'text']
::fieldnames = ['id', 'book', 'chapter', 'verse', 'word', 'text']
       
::
        # Open the output file to write the transformed data
::# Open the output file to write the transformed data
        with open(output_file, mode='w', encoding='utf-8', newline='') as outfile:
::with open(output_file, mode='w', encoding='utf-8', newline='') as outfile:
            writer = csv.DictWriter(outfile, fieldnames=fieldnames)
:::writer = csv.DictWriter(outfile, fieldnames=fieldnames)
            writer.writeheader()
:::writer.writeheader()
           
:::
            # Process each row in the input file
:::# Process each row in the input file
            for row in reader:
:::for row in reader:
                text = row['text']
::::text = row['text']
                words = text.split()  # Split the text into words
::::words = text.split()  # Split the text into words
                for word_index, word in enumerate(words, start=1):  # Word count starts from 1
::::for word_index, word in enumerate(words, start=1):  # Word count starts from 1
                    # Create a new row for each word
:::::# Create a new row for each word
                    new_row = {
:::::new_row = {
                        'id': row['id'],
::::::'id': row['id'],
                        'book': row['book'],
::::::'book': row['book'],
                        'chapter': row['chapter'],
::::::'chapter': row['chapter'],
                        'verse': row['verse'],
::::::'verse': row['verse'],
                        'word': word_index,
::::::'word': word_index,
                        'text': word
::::::'text': word
                    }
:::::}
                    # Write the new row to the output file
:::::# Write the new row to the output file
                    writer.writerow(new_row)
:::::writer.writerow(new_row)


# Call the function to transform the data
# Call the function to transform the data
Line 48: Line 48:
How it works:
How it works:


    Reading the Input File: The script opens the input CSV file using csv.DictReader to read the rows as dictionaries.
:Reading the Input File: The script opens the input CSV file using csv.DictReader to read the rows as dictionaries.
    Splitting the Text into Words: For each row, the text field is split into individual words.
:Splitting the Text into Words: For each row, the text field is split into individual words.
    Creating New Rows: For each word in the text, a new row is created, including the original id, book, chapter, verse, the word index (count), and the word itself.
:Creating New Rows: For each word in the text, a new row is created, including the original id, book, chapter, verse, the word index (count), and the word itself.
    Writing the Output File: The transformed data is written to a new CSV file using csv.DictWriter.
:Writing the Output File: The transformed data is written to a new CSV file using csv.DictWriter.


Output:
Output:
Line 57: Line 57:
The output file virtual_bible_interlinear_by_word.csv will have columns:
The output file virtual_bible_interlinear_by_word.csv will have columns:


    id
:id
    book
:book
    chapter
:chapter
    verse
:verse
    word (1-based position of the word within the verse)
:word (1-based position of the word within the verse)
    text (The word from the original text)
:text (The word from the original text)


Example:
Example:
Line 102: Line 102:
# Function to remove nikud (diacritics) from Hebrew text
# Function to remove nikud (diacritics) from Hebrew text
def remove_nikud(hebrew_text):
def remove_nikud(hebrew_text):
    # Regular expression to remove nikud (diacritics)
:# Regular expression to remove nikud (diacritics)
    nikud_pattern = r'[\u0591-\u05C7]'
:nikud_pattern = r'[\u0591-\u05C7]'
    return re.sub(nikud_pattern, '', hebrew_text)
:return re.sub(nikud_pattern, '', hebrew_text)


# Function to parse the CSV and transform data
# Function to parse the CSV and transform data
def transform_bible_data(input_file, output_file):
def transform_bible_data(input_file, output_file):
    with open(input_file, mode='r', encoding='utf-8') as infile:
:with open(input_file, mode='r', encoding='utf-8') as infile:
        reader = csv.DictReader(infile)
::reader = csv.DictReader(infile)
       
::
        # Prepare the fieldnames for the output CSV
::# Prepare the fieldnames for the output CSV
        fieldnames = ['id', 'book', 'chapter', 'verse', 'word', 'text']
::fieldnames = ['id', 'book', 'chapter', 'verse', 'word', 'text']
       
::
        # Open the output file to write the transformed data
::# Open the output file to write the transformed data
        with open(output_file, mode='w', encoding='utf-8', newline='') as outfile:
::with open(output_file, mode='w', encoding='utf-8', newline='') as outfile:
            writer = csv.DictWriter(outfile, fieldnames=fieldnames)
:::writer = csv.DictWriter(outfile, fieldnames=fieldnames)
            writer.writeheader()
:::writer.writeheader()
           
:::
            # Process each row in the input file
:::# Process each row in the input file
            for row in reader:
:::for row in reader:
                text = row['text']
::::text = row['text']
                words = text.split()  # Split the text into words
::::words = text.split()  # Split the text into words
               
::::
                for word_index, word in enumerate(words, start=1):  # Word count starts from 1
::::for word_index, word in enumerate(words, start=1):  # Word count starts from 1
                    # Remove nikud from Hebrew words
:::::# Remove nikud from Hebrew words
                    word_without_nikud = remove_nikud(word)
:::::word_without_nikud = remove_nikud(word)
                   
:::::
                    # Create a new row for each word
:::::# Create a new row for each word
                    new_row = {
:::::new_row = {
                        'id': row['id'],
::::::'id': row['id'],
                        'book': row['book'],
::::::'book': row['book'],
                        'chapter': row['chapter'],
::::::'chapter': row['chapter'],
                        'verse': row['verse'],
::::::'verse': row['verse'],
                        'word': word_index,
::::::'word': word_index,
                        'text': word_without_nikud
::::::'text': word_without_nikud
                    }
:::::}
                   
:::::
                    # Write the new row to the output file
:::::# Write the new row to the output file
                    writer.writerow(new_row)
:::::writer.writerow(new_row)


# Call the function to transform the data
# Call the function to transform the data
Line 148: Line 148:
Changes made:
Changes made:


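    Remove Nikud Function: A function remove_nikud() is added that uses a regular expression to remove Hebrew diacritical marks (nikud) from the words. The pattern [\u0591-\u05C7] covers the Hebrew cantillation marks (U+0591 to U+05AF) and the vowel points (U+05B0 to U+05C7). Note that this range also contains a few punctuation characters, such as maqaf (U+05BE) and sof pasuq (U+05C3), so those are removed as well.
:Remove Nikud Function: A function remove_nikud() is added that uses a regular expression to remove Hebrew diacritical marks (nikud) from the words. The pattern [\u0591-\u05C7] covers the Hebrew cantillation marks (U+0591 to U+05AF) and the vowel points (U+05B0 to U+05C7). Note that this range also contains a few punctuation characters, such as maqaf (U+05BE) and sof pasuq (U+05C3), so those are removed as well.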


    Word Processing: Before adding each word to the new row in the CSV, the script passes the word through the remove_nikud() function to strip away any diacritics.
:Word Processing: Before adding each word to the new row in the CSV, the script passes the word through the remove_nikud() function to strip away any diacritics.


How it works:
How it works:


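    Nikud Removal: The script strips nikud from each Hebrew word by using a regular expression to match and remove characters in the range U+0591 to U+05C7, which covers the Hebrew cantillation marks and vowel points (along with a few punctuation marks in that range, such as maqaf, U+05BE).
:Nikud Removal: The script strips nikud from each Hebrew word by using a regular expression to match and remove characters in the range U+0591 to U+05C7, which covers the Hebrew cantillation marks and vowel points (along with a few punctuation marks in that range, such as maqaf, U+05BE).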
    The rest of the logic remains the same: each word is processed and written to the new CSV file, now without nikud.
:The rest of the logic remains the same: each word is processed and written to the new CSV file, now without nikud.


Example:
Example:
Line 184: Line 184:
No problem! To parse the text column specifically for Hebrew words and create new rows for each Hebrew word, we'll need to:
No problem! To parse the text column specifically for Hebrew words and create new rows for each Hebrew word, we'll need to:


    Identify Hebrew words within the text column.
:Identify Hebrew words within the text column.
    Create a new row for each Hebrew word, similar to how we were creating rows for each word before, but this time only for the Hebrew words.
:Create a new row for each Hebrew word, similar to how we were creating rows for each word before, but this time only for the Hebrew words.


Here's an updated version of the script that focuses on identifying and processing Hebrew words:
Here's an updated version of the script that focuses on identifying and processing Hebrew words:
Line 198: Line 198:
# Function to remove nikud (diacritics) from Hebrew text
# Function to remove nikud (diacritics) from Hebrew text
def remove_nikud(hebrew_text):
def remove_nikud(hebrew_text):
    # Regular expression to remove nikud (diacritics)
:# Regular expression to remove nikud (diacritics)
    nikud_pattern = r'[\u0591-\u05C7]'
:nikud_pattern = r'[\u0591-\u05C7]'
    return re.sub(nikud_pattern, '', hebrew_text)
:return re.sub(nikud_pattern, '', hebrew_text)


# Function to extract Hebrew words from text
# Function to extract Hebrew words from text
def extract_hebrew_words(text):
def extract_hebrew_words(text):
    # Regular expression to find Hebrew words
:# Regular expression to find Hebrew words
    hebrew_pattern = r'[\u0590-\u05FF]+'
:hebrew_pattern = r'[\u0590-\u05FF]+'
    return re.findall(hebrew_pattern, text)
:return re.findall(hebrew_pattern, text)


# Function to parse the CSV and transform data
# Function to parse the CSV and transform data
def transform_bible_data(input_file, output_file):
def transform_bible_data(input_file, output_file):
    with open(input_file, mode='r', encoding='utf-8') as infile:
:with open(input_file, mode='r', encoding='utf-8') as infile:
        reader = csv.DictReader(infile)
::reader = csv.DictReader(infile)
       
::
        # Prepare the fieldnames for the output CSV
::# Prepare the fieldnames for the output CSV
        fieldnames = ['id', 'book', 'chapter', 'verse', 'word', 'text']
::fieldnames = ['id', 'book', 'chapter', 'verse', 'word', 'text']
       
::
        # Open the output file to write the transformed data
::# Open the output file to write the transformed data
        with open(output_file, mode='w', encoding='utf-8', newline='') as outfile:
::with open(output_file, mode='w', encoding='utf-8', newline='') as outfile:
            writer = csv.DictWriter(outfile, fieldnames=fieldnames)
:::writer = csv.DictWriter(outfile, fieldnames=fieldnames)
            writer.writeheader()
:::writer.writeheader()
           
:::
            # Process each row in the input file
:::# Process each row in the input file
            for row in reader:
:::for row in reader:
                text = row['text']
::::text = row['text']
                hebrew_words = extract_hebrew_words(text)  # Extract Hebrew words
::::hebrew_words = extract_hebrew_words(text)  # Extract Hebrew words
               
::::
                for word_index, word in enumerate(hebrew_words, start=1):  # Word count starts from 1
::::for word_index, word in enumerate(hebrew_words, start=1):  # Word count starts from 1
                    # Remove nikud from Hebrew words
:::::# Remove nikud from Hebrew words
                    word_without_nikud = remove_nikud(word)
:::::word_without_nikud = remove_nikud(word)
                   
:::::
                    # Create a new row for each Hebrew word
:::::# Create a new row for each Hebrew word
                    new_row = {
:::::new_row = {
                        'id': row['id'],
::::::'id': row['id'],
                        'book': row['book'],
::::::'book': row['book'],
                        'chapter': row['chapter'],
::::::'chapter': row['chapter'],
                        'verse': row['verse'],
::::::'verse': row['verse'],
                        'word': word_index,
::::::'word': word_index,
                        'text': word_without_nikud
::::::'text': word_without_nikud
                    }
:::::}
                   
:::::
                    # Write the new row to the output file
:::::# Write the new row to the output file
                    writer.writerow(new_row)
:::::writer.writerow(new_row)


# Call the function to transform the data
# Call the function to transform the data
Line 250: Line 250:
Explanation of Changes:
Explanation of Changes:


    Extract Hebrew Words:
:Extract Hebrew Words:
        The function extract_hebrew_words(text) uses a regular expression r'[\u0590-\u05FF]+' to match sequences of Hebrew characters in the text. This is done using re.findall(), which returns all the Hebrew words found in the text.
::The function extract_hebrew_words(text) uses a regular expression r'[\u0590-\u05FF]+' to match sequences of Hebrew characters in the text. This is done using re.findall(), which returns all the Hebrew words found in the text.
    Processing Hebrew Words:
:Processing Hebrew Words:
        We now only process the Hebrew words found in the text. The script will create a new row for each Hebrew word, just as before, but this time only for words that match the Hebrew Unicode range.
::We now only process the Hebrew words found in the text. The script will create a new row for each Hebrew word, just as before, but this time only for words that match the Hebrew Unicode range.
    Nikud Removal:
:Nikud Removal:
        The script still removes nikud from each Hebrew word with the remove_nikud() function.
::The script still removes nikud from each Hebrew word with the remove_nikud() function.


How it works:
How it works:


    Hebrew Word Extraction: The extract_hebrew_words() function finds all Hebrew words in the text column by looking for sequences of Hebrew characters (Unicode range \u0590-\u05FF).
:Hebrew Word Extraction: The extract_hebrew_words() function finds all Hebrew words in the text column by looking for sequences of Hebrew characters (Unicode range \u0590-\u05FF).
    Word Processing: The script loops through the Hebrew words, removes nikud, and creates a row for each word.
:Word Processing: The script loops through the Hebrew words, removes nikud, and creates a row for each word.
    Writing Output: The output CSV file will contain a new row for each Hebrew word in the original text.
:Writing Output: The output CSV file will contain a new row for each Hebrew word in the original text.


Example:
Example:
Line 278: Line 278:
Notes:
Notes:


    The text will be parsed for Hebrew words and will create rows for each Hebrew word found.
:The text will be parsed for Hebrew words and will create rows for each Hebrew word found.
    Words like "בְּרֵאשִׁית" and "הָאָרֶץ" will be handled, and if they have nikud, the nikud will be removed in the output.
:Words like "בְּרֵאשִׁית" and "הָאָרֶץ" will be handled, and if they have nikud, the nikud will be removed in the output.


Make sure your input file contains Hebrew text in the text column, and this script will process and create the desired output.
Make sure your input file contains Hebrew text in the text column, and this script will process and create the desired output.
Line 300: Line 300:
# Function to remove nikud (diacritics) from Hebrew text
# Function to remove nikud (diacritics) from Hebrew text
def remove_nikud(hebrew_text):
def remove_nikud(hebrew_text):
    # Regular expression to remove nikud (diacritics)
:# Regular expression to remove nikud (diacritics)
    nikud_pattern = r'[\u0591-\u05C7]'
:nikud_pattern = r'[\u0591-\u05C7]'
    return re.sub(nikud_pattern, '', hebrew_text)
:return re.sub(nikud_pattern, '', hebrew_text)


# Function to extract Hebrew words in the order they appear in the text
# Function to extract Hebrew words in the order they appear in the text
def extract_hebrew_words_in_order(text):
def extract_hebrew_words_in_order(text):
    # Regular expression to match Hebrew words
:# Regular expression to match Hebrew words
    hebrew_pattern = r'[\u0590-\u05FF]+'
:hebrew_pattern = r'[\u0590-\u05FF]+'
    return re.findall(hebrew_pattern, text)
:return re.findall(hebrew_pattern, text)


# Function to parse the CSV and transform data
# Function to parse the CSV and transform data
def transform_bible_data(input_file, output_file):
def transform_bible_data(input_file, output_file):
    with open(input_file, mode='r', encoding='utf-8') as infile:
:with open(input_file, mode='r', encoding='utf-8') as infile:
        reader = csv.DictReader(infile)
::reader = csv.DictReader(infile)
       
::
        # Prepare the fieldnames for the output CSV
::# Prepare the fieldnames for the output CSV
        fieldnames = ['id', 'book', 'chapter', 'verse', 'word', 'text']
::fieldnames = ['id', 'book', 'chapter', 'verse', 'word', 'text']
       
::
        # Open the output file to write the transformed data
::# Open the output file to write the transformed data
        with open(output_file, mode='w', encoding='utf-8', newline='') as outfile:
::with open(output_file, mode='w', encoding='utf-8', newline='') as outfile:
            writer = csv.DictWriter(outfile, fieldnames=fieldnames)
:::writer = csv.DictWriter(outfile, fieldnames=fieldnames)
            writer.writeheader()
:::writer.writeheader()
           
:::
            # Process each row in the input file
:::# Process each row in the input file
            for row in reader:
:::for row in reader:
                text = row['text']
::::text = row['text']
                hebrew_words = extract_hebrew_words_in_order(text)  # Extract Hebrew words in order
::::hebrew_words = extract_hebrew_words_in_order(text)  # Extract Hebrew words in order
               
::::
                for word_index, word in enumerate(hebrew_words, start=1):  # Word count starts from 1
::::for word_index, word in enumerate(hebrew_words, start=1):  # Word count starts from 1
                    # Remove nikud from Hebrew words
:::::# Remove nikud from Hebrew words
                    word_without_nikud = remove_nikud(word)
:::::word_without_nikud = remove_nikud(word)
                   
:::::
                    # Create a new row for each Hebrew word
:::::# Create a new row for each Hebrew word
                    new_row = {
:::::new_row = {
                        'id': row['id'],
::::::'id': row['id'],
                        'book': row['book'],
::::::'book': row['book'],
                        'chapter': row['chapter'],
::::::'chapter': row['chapter'],
                        'verse': row['verse'],
::::::'verse': row['verse'],
                        'word': word_index,
::::::'word': word_index,
                        'text': word_without_nikud
::::::'text': word_without_nikud
                    }
:::::}
                   
:::::
                    # Write the new row to the output file
:::::# Write the new row to the output file
                    writer.writerow(new_row)
:::::writer.writerow(new_row)


# Call the function to transform the data
# Call the function to transform the data
Line 352: Line 352:
Changes and Explanation:
Changes and Explanation:


    Preserving Word Order:
:Preserving Word Order:
        The extract_hebrew_words_in_order() function uses the regular expression r'[\u0590-\u05FF]+' with re.findall(), which scans the string left to right and returns matches in the order found, so the Hebrew words come back in the order they appear in the text column. This ensures that we process Hebrew words in the exact sequence as they are found in the text, without any rearrangement.
::The extract_hebrew_words_in_order() function uses the regular expression r'[\u0590-\u05FF]+' with re.findall(), which scans the string left to right and returns matches in the order found, so the Hebrew words come back in the order they appear in the text column. This ensures that we process Hebrew words in the exact sequence as they are found in the text, without any rearrangement.


    Hebrew Words Processing:
:Hebrew Words Processing:
        After extracting the Hebrew words, we iterate over them in the order they were found, starting from word_index = 1 for the first Hebrew word in the verse. Each Hebrew word is stripped of nikud using the remove_nikud() function and written to the output file.
::After extracting the Hebrew words, we iterate over them in the order they were found, starting from word_index = 1 for the first Hebrew word in the verse. Each Hebrew word is stripped of nikud using the remove_nikud() function and written to the output file.


    Word Order in Output:
:Word Order in Output:
        The output file will have rows for each Hebrew word, and the words will appear in the same order as they appear in the original text, maintaining the correct sequence.
::The output file will have rows for each Hebrew word, and the words will appear in the same order as they appear in the original text, maintaining the correct sequence.


Example Input:
Example Input:
Line 378: Line 378:
How It Works:
How It Works:


    Text Parsing: The script reads the text column, extracts Hebrew words (using the regex), and processes them in the order they appear.
:Text Parsing: The script reads the text column, extracts Hebrew words (using the regex), and processes them in the order they appear.
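    Hebrew Words: It then creates a new row for each Hebrew word, and the word column records the word's 1-based position among the Hebrew words of the verse (non-Hebrew tokens are skipped and not counted).
:Hebrew Words: It then creates a new row for each Hebrew word, and the word column records the word's 1-based position among the Hebrew words of the verse (non-Hebrew tokens are skipped and not counted).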
    Nikud Removal: Nikud is removed from each Hebrew word before it's written to the output file.
:Nikud Removal: Nikud is removed from each Hebrew word before it's written to the output file.


This solution will ensure that Hebrew words are parsed and outputted in the correct order as found in the original text, with each Hebrew word appearing on a separate row.
This solution will ensure that Hebrew words are parsed and outputted in the correct order as found in the original text, with each Hebrew word appearing on a separate row.
Line 394: Line 394:
# Function to remove nikud (diacritics) from Hebrew text
# Function to remove nikud (diacritics) from Hebrew text
def remove_nikud(hebrew_text):
def remove_nikud(hebrew_text):
    # Regular expression to remove nikud (diacritics)
:# Regular expression to remove nikud (diacritics)
    nikud_pattern = r'[\u0591-\u05C7]'
:nikud_pattern = r'[\u0591-\u05C7]'
    return re.sub(nikud_pattern, '', hebrew_text)
:return re.sub(nikud_pattern, '', hebrew_text)


# Function to extract Hebrew words in the order they appear in the text
# Function to extract Hebrew words in the order they appear in the text
def extract_hebrew_words_in_order(text):
def extract_hebrew_words_in_order(text):
    # Regular expression to match Hebrew words
:# Regular expression to match Hebrew words
    hebrew_pattern = r'[\u0590-\u05FF]+'
:hebrew_pattern = r'[\u0590-\u05FF]+'
    return re.findall(hebrew_pattern, text)
:return re.findall(hebrew_pattern, text)


# Function to parse the CSV and transform data
# Function to parse the CSV and transform data
def transform_bible_data(input_file, output_file):
def transform_bible_data(input_file, output_file):
    with open(input_file, mode='r', encoding='utf-8') as infile:
:with open(input_file, mode='r', encoding='utf-8') as infile:
        reader = csv.DictReader(infile)
::reader = csv.DictReader(infile)
       
::
        # Prepare the fieldnames for the output CSV
::# Prepare the fieldnames for the output CSV
        fieldnames = ['id', 'book', 'chapter', 'verse', 'word', 'text']
::fieldnames = ['id', 'book', 'chapter', 'verse', 'word', 'text']
       
::
        # Open the output file to write the transformed data
::# Open the output file to write the transformed data
        with open(output_file, mode='w', encoding='utf-8', newline='') as outfile:
::with open(output_file, mode='w', encoding='utf-8', newline='') as outfile:
            writer = csv.DictWriter(outfile, fieldnames=fieldnames)
:::writer = csv.DictWriter(outfile, fieldnames=fieldnames)
            writer.writeheader()
:::writer.writeheader()
           
:::
            # Process each row in the input file
:::# Process each row in the input file
            for row in reader:
:::for row in reader:
                text = row['text']
::::text = row['text']
                hebrew_words = extract_hebrew_words_in_order(text)  # Extract Hebrew words in order
::::hebrew_words = extract_hebrew_words_in_order(text)  # Extract Hebrew words in order
               
::::
                for word_index, word in enumerate(hebrew_words, start=1):  # Word count starts from 1
::::for word_index, word in enumerate(hebrew_words, start=1):  # Word count starts from 1
                    # Remove nikud from Hebrew words
:::::# Remove nikud from Hebrew words
                    word_without_nikud = remove_nikud(word)
:::::word_without_nikud = remove_nikud(word)
                   
:::::
                    # Create a new row for each Hebrew word
:::::# Create a new row for each Hebrew word
                    new_row = {
:::::new_row = {
                        'id': row['id'],
::::::'id': row['id'],
                        'book': row['book'],
::::::'book': row['book'],
                        'chapter': row['chapter'],
::::::'chapter': row['chapter'],
                        'verse': row['verse'],
::::::'verse': row['verse'],
                        'word': word_index,
::::::'word': word_index,
                        'text': word_without_nikud
::::::'text': word_without_nikud
                    }
:::::}
                   
:::::
                    # Write the new row to the output file
:::::# Write the new row to the output file
                    writer.writerow(new_row)
:::::writer.writerow(new_row)


# Call the function to transform the data
# Call the function to transform the data

Navigation menu