"""
# Exemple d'indexation de documents du JORF avec SOLR

Installation de Solr et création d'un *core* appelé `jorf`:

```bash
# Création d'un conteneur et démarrage
docker run --name cours_nlp -d -p 8983:8983 -t solr
# Création d'un "core" (ensemble logique d'indexes)
docker exec -it --user=solr cours_nlp bin/solr create_core -c jorf
```

Création d'un environnement virtuel Python et installation de `pysolr`:

```bash
python3.11 -m venv ~/venv/demo_nlp_solr
source ~/venv/demo_nlp_solr/bin/activate
pip install -U pip
pip install pysolr
```

Indexation d'une année de JORF (2023), avec le fichier proposé pour le TP2:

```bash
python solr_index_jorf.py jorf_2023.csv
```
"""
import sys
import csv

import pysolr


def _iter_documents(rows):
    """Yield one complete Solr document per run of rows sharing a text id.

    Each row is a sequence of fields where ``row[0]`` is the text id,
    ``row[1]`` the child id and ``row[-1]`` the content. Depending on the
    child id, the content is stored as:

    * ``id`` when the child id is empty (URI of the text);
    * ``title_txt_fr`` when the child id starts with ``JORFVERS``;
    * one paragraph of ``content_txt_fr`` otherwise.

    A document is yielded only once the text id changes (or the input ends),
    so every yielded document carries its full ``content_txt_fr`` body,
    built by joining its paragraphs with blank lines.

    Raises:
        IndexError: if a non-empty row has fewer than two fields.
    """
    document = None
    paragraphs = []

    for row in rows:
        if not row:
            # csv.reader yields an empty list for blank lines: nothing to index.
            continue

        text_id = row[0]
        if document is None or text_id != document['text_id_s']:
            if document is not None:
                document['content_txt_fr'] = '\n\n'.join(paragraphs)
                yield document
            document = {'text_id_s': text_id}
            paragraphs = []

        child_id = row[1]
        content = row[-1]
        if not child_id:
            document['id'] = content  # URI of the text
        elif child_id.startswith("JORFVERS"):
            document['title_txt_fr'] = content  # Title of the text
        else:
            paragraphs.append(content)

    # The last text has no following row to trigger its finalization.
    if document is not None:
        document['content_txt_fr'] = '\n\n'.join(paragraphs)
        yield document


def main():
    """Index the pipe-delimited CSV file named by ``sys.argv[1]`` into Solr.

    Rows are grouped into documents by text id and sent to the ``jorf`` core
    of the local Solr instance in batches of 100 complete documents. Only
    finished documents are ever sent, so a batch boundary can no longer
    truncate the document being built.

    Returns:
        0 on success, 2 when the CSV file argument is missing.
    """
    if len(sys.argv) < 2:
        print(f"usage: {sys.argv[0]} CSV_FILE", file=sys.stderr)
        return 2

    file_to_index = sys.argv[1]
    batch_size = 100

    # Explicit encoding so decoding does not depend on the locale.
    # NOTE(review): assumes the export is UTF-8 -- confirm against the data.
    with open(file_to_index, newline='', encoding='utf-8') as csvfile:
        rows = csv.reader(csvfile, delimiter='|')

        # always_commit=True: each add() below is committed by pysolr.
        solr = pysolr.Solr('http://localhost:8983/solr/jorf/', always_commit=True)

        indexation_pool = []
        for document in _iter_documents(rows):
            indexation_pool.append(document)
            # Flush every `batch_size` completed documents.
            if len(indexation_pool) >= batch_size:
                solr.add(indexation_pool)
                indexation_pool = []

        # Send whatever is left after the last full batch.
        if indexation_pool:
            solr.add(indexation_pool)

    return 0

if __name__ == "__main__":
    # Run the indexer and use its return value as the process exit status.
    exit_status = main()
    sys.exit(exit_status)