ai_english/import_cet4_words.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""导入大学英语四级核心词汇到数据库"""

import pandas as pd
import mysql.connector
from datetime import datetime
import json

# 数据库配置
db_config = {
    'host': 'localhost',
    'port': 3306,
    'user': 'root',
    'password': 'JKjk20011115',
    'database': 'ai_english_learning',
    'charset': 'utf8mb4'
}

# 词汇书ID
BOOK_ID = 'cet4_core_2500'

def clean_text(text):
    """清理文本，处理nan和空值"""
    if pd.isna(text) or str(text).strip() == '' or str(text).strip() == 'nan':
        return None
    return str(text).strip()

def extract_part_of_speech(translation):
    """从中文翻译中提取词性"""
    if not translation:
        return 'noun'

    pos_map = {
        'v.': 'verb',
        'n.': 'noun',
        'adj.': 'adjective',
        'adv.': 'adverb',
        'prep.': 'preposition',
        'conj.': 'conjunction',
        'pron.': 'pronoun',
        'interj.': 'interjection'
    }

    for abbr, full in pos_map.items():
        if abbr in translation or abbr.replace('.', '') in translation:
            return full

    # 中文词性判断
    if '动' in translation:
        return 'verb'
    elif '形' in translation or '容' in translation:
        return 'adjective'
    elif '副' in translation:
        return 'adverb'
    elif '介' in translation:
        return 'preposition'
    elif '连' in translation:
        return 'conjunction'

    return 'noun'

def import_words_from_excel(file_path):
    """从Excel导入单词"""
    try:
        print(f"📖 正在读取文件: {file_path}")
        df = pd.read_excel(file_path)

        print(f"📊 文件列名: {df.columns.tolist()}")
        print(f"📊 总行数: {len(df)}")

        conn = mysql.connector.connect(**db_config)
        cursor = conn.cursor()

        # 清理旧数据
        print("\n清理旧数据...")
        cursor.execute("DELETE FROM ai_vocabulary_book_words WHERE book_id = %s", (BOOK_ID,))
        cursor.execute("""
            DELETE v FROM ai_vocabulary v
            LEFT JOIN ai_vocabulary_book_words bw ON bw.vocabulary_id = v.id
            WHERE bw.id IS NULL
        """)
        conn.commit()

        # SQL语句
        insert_vocab_sql = """
        INSERT INTO ai_vocabulary
        (word, phonetic_us, phonetic_uk, phonetic, level, frequency, is_active,
         word_root, synonyms, antonyms, derivatives, collocations, created_at, updated_at)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        """

        insert_definition_sql = """
        INSERT INTO ai_vocabulary_definitions
        (vocabulary_id, part_of_speech, definition_en, definition_cn, sort_order, created_at)
        VALUES (%s, %s, %s, %s, %s, %s)
        """

        insert_example_sql = """
        INSERT INTO ai_vocabulary_examples
        (vocabulary_id, sentence_en, sentence_cn, sort_order, created_at)
        VALUES (%s, %s, %s, %s, %s)
        """

        insert_book_word_sql = """
        INSERT INTO ai_vocabulary_book_words
        (book_id, vocabulary_id, sort_order, created_at)
        VALUES (%s, %s, %s, %s)
        """

        success_count = 0
        error_count = 0

        for index, row in df.iterrows():
            try:
                # 尝试多个可能的列名
                word = clean_text(row.get('Word') or row.get('单词(Word)') or row.get('单词'))
                if not word:
                    continue

                # 检查单词是否已存在
                cursor.execute("SELECT id FROM ai_vocabulary WHERE word = %s", (word,))
                existing = cursor.fetchone()

                if existing:
                    # 单词已存在，使用现有ID
                    vocab_id = existing[0]
                else:
                    # 单词不存在，插入新记录
                    # 音标
                    phonetic_us = clean_text(row.get('美式音标'))
                    phonetic_uk = clean_text(row.get('英式音标'))
                    phonetic = phonetic_us or phonetic_uk

                    # 释义
                    translation_cn = clean_text(row.get('中文含义'))
                    translation_en = clean_text(row.get('英文翻译（对应中文含义）') or row.get('英文翻译'))

                    if not translation_cn:
                        print(f"⚠️  跳过 {word}：缺少中文含义")
                        continue

                    if not translation_en:
                        translation_en = word

                    part_of_speech = extract_part_of_speech(translation_cn)

                    # 例句
                    example_en = clean_text(row.get('例句'))
                    example_cn = clean_text(row.get('例句中文翻译'))

                    # 词根
                    word_root = clean_text(row.get('词根') or row.get('词根/词源'))

                    # 同义词
                    synonyms_text = clean_text(row.get('同义词（含义）'))
                    synonyms_json = '[]'
                    if synonyms_text:
                        syn_list = [syn.strip() for syn in synonyms_text.split('；') if syn.strip()]
                        synonyms_json = json.dumps(syn_list, ensure_ascii=False)

                    # 反义词
                    antonyms_text = clean_text(row.get('反义词（含义）'))
                    antonyms_json = '[]'
                    if antonyms_text:
                        ant_list = [ant.strip() for ant in antonyms_text.split('；') if ant.strip()]
                        antonyms_json = json.dumps(ant_list, ensure_ascii=False)

                    # 派生词
                    derivatives_text = clean_text(row.get('派生词（含义）'))
                    derivatives_json = '[]'
                    if derivatives_text:
                        der_list = [der.strip() for der in derivatives_text.split('；') if der.strip()]
                        derivatives_json = json.dumps(der_list, ensure_ascii=False)

                    # 词组搭配
                    phrases_text = clean_text(row.get('词组搭配（中文含义）'))
                    collocations_json = '[]'
                    if phrases_text:
                        col_list = [phrase.strip() for phrase in phrases_text.split('；') if phrase.strip()]
                        collocations_json = json.dumps(col_list, ensure_ascii=False)

                    # 插入词汇
                    now = datetime.now()
                    cursor.execute(insert_vocab_sql, (
                        word, phonetic_us, phonetic_uk, phonetic,
                        'intermediate',  # CET-4难度
                        index + 1, True,
                        word_root, synonyms_json, antonyms_json,
                        derivatives_json, collocations_json,
                        now, now
                    ))

                    vocab_id = cursor.lastrowid

                    # 插入释义
                    cursor.execute(insert_definition_sql, (
                        vocab_id, part_of_speech, translation_en,
                        translation_cn, 0, now
                    ))

                    # 插入例句
                    if example_en and example_cn:
                        examples_en = example_en.split('；')
                        examples_cn = example_cn.split('；')

                        for i, (ex_en, ex_cn) in enumerate(zip(examples_en, examples_cn)):
                            ex_en = ex_en.strip()
                            ex_cn = ex_cn.strip()
                            if ex_en and ex_cn:
                                cursor.execute(insert_example_sql, (
                                    vocab_id, ex_en, ex_cn, i, now
                                ))

                # 关联到词汇书（无论单词是否已存在，都要关联）
                now = datetime.now()
                try:
                    cursor.execute(insert_book_word_sql, (
                        BOOK_ID, vocab_id, index, now
                    ))
                except Exception as link_error:
                    # 如果关联已存在，跳过
                    if '1062' not in str(link_error):  # 不是重复键错误
                        raise

                success_count += 1
                if success_count % 100 == 0:
                    print(f"✅ 已处理 {success_count} 个单词...")
                    conn.commit()

            except Exception as e:
                error_count += 1
                print(f"❌ 导入第 {index + 1} 行失败: {e}")

        conn.commit()

        # 更新词汇书总数
        cursor.execute(
            "UPDATE ai_vocabulary_books SET total_words = %s WHERE id = %s",
            (success_count, BOOK_ID)
        )
        conn.commit()

        print(f"\n🎉 大学英语四级核心词汇导入完成!")
        print(f"✅ 成功: {success_count} 个单词")
        print(f"❌ 失败: {error_count} 个单词")

        # 验证
        cursor.execute(
            "SELECT COUNT(*) FROM ai_vocabulary_book_words WHERE book_id = %s",
            (BOOK_ID,)
        )
        print(f"📊 词汇书中共有 {cursor.fetchone()[0]} 个单词")

    except Exception as e:
        print(f"❌ 导入失败: {e}")
        import traceback
        traceback.print_exc()
    finally:
        if 'cursor' in locals():
            cursor.close()
        if 'conn' in locals():
            conn.close()

if __name__ == '__main__':
    import_words_from_excel('data/大学英语四级核心词汇.xlsx')